aco: allow to load/store 16-bit values in VMEM for tess and geom

[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 48859686b9ca4488fc7c464b77541bc1905ef460..5346376a48de9c423b6ac08d152a5e71644f624f 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -304,20 +304,21 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
        return;
     if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
        return;
-   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
-   split->operands[0] = Operand(vec_src);
-   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
     RegClass rc;
     if (num_components > vec_src.size()) {
-      if (vec_src.type() == RegType::sgpr)
+      if (vec_src.type() == RegType::sgpr) {
+         /* should still help get_alu_src() */
+         emit_split_vector(ctx, vec_src, vec_src.size());
           return;
-
+      }
        /* sub-dword split */
-      assert(vec_src.type() == RegType::vgpr);
        rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
     } else {
        rc = RegClass(vec_src.type(), vec_src.size() / num_components);
     }
+   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
+   split->operands[0] = Operand(vec_src);
+   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
     for (unsigned i = 0; i < num_components; i++) {
        elems[i] = {ctx->program->allocateId(), rc};
        split->definitions[i] = Definition(elems[i]);
@@ -501,10 +502,11 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
           return vec;
  
        Temp dst{ctx->program->allocateId(), s1};
-      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 1)};
+      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
        bfe->operands[0] = Operand(vec);
        bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
        bfe->definitions[0] = Definition(dst);
+      bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
        ctx->block->instructions.emplace_back(std::move(bfe));
        return dst;
     }
@@ -709,9 +711,8 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
  {
     aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
     aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
-   bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
     bool use_valu = s_op == aco_opcode::num_opcodes ||
-                   divergent_vals ||
+                   nir_dest_is_divergent(instr->dest.dest) ||
                     ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
                     ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
     aco_opcode op = use_valu ? v_op : s_op;
@@ -748,18 +749,12 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
  
     if (dst.type() == RegType::vgpr) {
        aco_ptr<Instruction> bcsel;
-      if (dst.regClass() == v2b) {
-         then = as_vgpr(ctx, then);
-         els = as_vgpr(ctx, els);
-
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
-      } else if (dst.regClass() == v1) {
+      if (dst.size() == 1) {
           then = as_vgpr(ctx, then);
           els = as_vgpr(ctx, els);
  
           bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
-      } else if (dst.regClass() == v2) {
+      } else if (dst.size() == 2) {
           Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
           bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
           Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
@@ -783,7 +778,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
        assert(els.regClass() == bld.lm);
     }
  
-   if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
+   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
        if (dst.regClass() == s1 || dst.regClass() == s2) {
           assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
           assert(dst.size() == then.size());
@@ -882,7 +877,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
  
     /* Extract the exponent and compute the unbiased value. */
-   Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
+   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
+   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
  
     /* Extract the fractional part. */
     Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
@@ -898,7 +894,7 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
  
     /* Get the sign bit. */
-   Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi);
+   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
  
     /* Decide the operation to apply depending on the unbiased exponent. */
     Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
@@ -1015,8 +1011,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
  
        if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
           aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
-         for (unsigned i = 0; i < num; ++i)
-            vec->operands[i] = Operand{elems[i]};
+         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
+         for (unsigned i = 0; i < num; ++i) {
+            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
+               vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
+            else
+               vec->operands[i] = Operand{elems[i]};
+         }
           vec->definitions[0] = Definition(dst);
           ctx->block->instructions.emplace_back(std::move(vec));
           ctx->allocated_vec.emplace(dst.id(), elems);
@@ -1594,9 +1595,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
        } else if (dst.regClass() == v2) {
@@ -1612,9 +1611,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
        } else if (dst.regClass() == v2) {
@@ -1630,12 +1627,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = get_alu_src(ctx, instr->src[1]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
           if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
-            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false);
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
           else
-            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
        } else if (dst.regClass() == v1) {
           if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
              emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
@@ -1658,9 +1653,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
           // TODO: check fp_mode.must_flush_denorms16_64
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
        } else if (dst.regClass() == v2) {
@@ -1682,9 +1675,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
           // TODO: check fp_mode.must_flush_denorms16_64
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
        } else if (dst.regClass() == v2) {
@@ -1703,9 +1694,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_fmax3: {
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
        } else if (dst.regClass() == v1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
        } else {
@@ -1717,9 +1706,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_fmin3: {
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
        } else if (dst.regClass() == v1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
        } else {
@@ -1731,9 +1718,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_fmed3: {
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
        } else if (dst.regClass() == v1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
        } else {
@@ -1832,8 +1817,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_frsq: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_rsq(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
@@ -1848,8 +1832,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fneg: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src));
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
        } else if (dst.regClass() == v1) {
           if (ctx->block->fp_mode.must_flush_denorms32)
              src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
@@ -1871,8 +1854,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fabs: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src));
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
        } else if (dst.regClass() == v1) {
           if (ctx->block->fp_mode.must_flush_denorms32)
              src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
@@ -1894,8 +1876,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fsat: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), Operand(0x3f800000u), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
        } else if (dst.regClass() == v1) {
           bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
           /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
@@ -1914,8 +1895,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_flog2: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_log2(ctx, bld, Definition(dst), src);
        } else {
@@ -1928,8 +1908,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_frcp: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_rcp(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
@@ -1943,9 +1922,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_fexp2: {
        if (dst.regClass() == v2b) {
-         Temp src = get_alu_src(ctx, instr->src[0]);
-         Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
        } else {
@@ -1958,8 +1935,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fsqrt: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_sqrt(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
@@ -1973,9 +1949,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     }
     case nir_op_ffract: {
        if (dst.regClass() == v2b) {
-         Temp src = get_alu_src(ctx, instr->src[0]);
-         Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -1990,8 +1964,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_ffloor: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -2006,8 +1979,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fceil: {
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -2037,8 +2009,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_ftrunc: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -2053,8 +2024,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fround_even: {
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -2099,8 +2069,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.regClass() == v2b) {
           Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
           aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
-         tmp = bld.vop1(opcode, bld.def(v1), tmp);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop1(opcode, Definition(dst), tmp);
        } else if (dst.regClass() == v1) {
           Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
  
@@ -2121,9 +2090,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = get_alu_src(ctx, instr->src[1]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
        } else if (dst.regClass() == v1) {
           bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
        } else if (dst.regClass() == v2) {
@@ -2138,8 +2105,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_frexp_sig: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
        } else if (dst.regClass() == v1) {
           bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
        } else if (dst.regClass() == v2) {
@@ -2176,8 +2142,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
           cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
        } else if (dst.regClass() == v1) {
           Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
@@ -2205,16 +2170,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 64)
           src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
-      src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+      bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
        break;
     }
     case nir_op_f2f16_rtz: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 64)
           src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
-      src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u));
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+      bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
        break;
     }
     case nir_op_f2f32: {
@@ -2241,8 +2204,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 8)
           src = convert_int(bld, src, 8, 16, true);
-      Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v1), src);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+      bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
        break;
     }
     case nir_op_i2f32: {
@@ -2281,8 +2243,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 8)
           src = convert_int(bld, src, 8, 16, false);
-      Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v1), src);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+      bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
        break;
     }
     case nir_op_u2f32: {
@@ -2574,8 +2535,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
        } else if (dst.regClass() == v2b) {
           Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
        } else {
           unreachable("Wrong destination register class for nir_op_b2f16.");
        }
@@ -3825,6 +3785,29 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
  }
  
  
+aco_opcode get_buffer_store_op(bool smem, unsigned bytes) /* maps a store size in bytes to a buffer store opcode; smem selects the s_buffer_store_* variants */
+{
+   switch (bytes) {
+   case 1:
+      assert(!smem); /* only buffer_store_byte is returned for 1 byte; smem callers must not ask for it */
+      return aco_opcode::buffer_store_byte;
+   case 2:
+      assert(!smem); /* only buffer_store_short is returned for 2 bytes; smem callers must not ask for it */
+      return aco_opcode::buffer_store_short;
+   case 4:
+      return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+   case 8:
+      return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+   case 12:
+      assert(!smem); /* no s_buffer_store_* opcode is returned for 12 bytes; non-smem only */
+      return aco_opcode::buffer_store_dwordx3;
+   case 16:
+      return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+   }
+   unreachable("Unexpected store size"); /* reached for any size other than 1, 2, 4, 8, 12 or 16 bytes */
+   return aco_opcode::num_opcodes; /* follows unreachable(); presumably only so every path has a return value */
+}
+
  void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
                          Temp data, unsigned writemask, int swizzle_element_size,
                          unsigned *write_count, Temp *write_datas, unsigned *offsets)
@@ -3940,7 +3923,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
     assert(vdata.size() >= 1 && vdata.size() <= 4);
  
     Builder bld(ctx->program, ctx->block);
-   aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
+   aco_opcode op = get_buffer_store_op(false, vdata.bytes());
     const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
  
     Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
@@ -3957,7 +3940,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset
                                     bool allow_combining = true, bool reorder = true, bool slc = false)
  {
     Builder bld(ctx->program, ctx->block);
-   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
     assert(write_mask);
     write_mask = widen_mask(write_mask, elem_size_bytes);
  
@@ -3977,8 +3960,8 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset,
                       unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
                       unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
  {
-   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
-   assert((num_components * elem_size_bytes / 4) == dst.size());
+   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
+   assert((num_components * elem_size_bytes) == dst.bytes());
     assert(!!stride != allow_combining);
  
     Builder bld(ctx->program, ctx->block);
@@ -4123,11 +4106,9 @@ std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intri
     Builder bld(ctx->program, ctx->block);
  
     uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
-   uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
-   uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
-   uint32_t output_vertex_size = num_tcs_outputs * 16;
+   uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
     uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
-   uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+   uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
  
     std::pair<Temp, unsigned> offs = instr
                                      ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
@@ -4175,11 +4156,7 @@ std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx
  {
     Builder bld(ctx->program, ctx->block);
  
-   unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
-                              ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
-                              : ctx->args->options->key.tes.tcs_num_outputs;
-
-   unsigned output_vertex_size = num_tcs_outputs * 16;
+   unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
     unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
     unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
     unsigned attr_stride = ctx->tcs_num_patches;
@@ -4200,10 +4177,12 @@ std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx
  
  bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
  {
+   assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
     if (mask == 0)
        return false;
  
-   unsigned off = nir_intrinsic_base(instr) * 4u;
+   unsigned drv_loc = nir_intrinsic_base(instr);
     nir_src *off_src = nir_get_io_offset_src(instr);
  
     if (!nir_src_is_const(*off_src)) {
@@ -4212,15 +4191,10 @@ bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr
     }
  
     *indirect = false;
-   off += nir_src_as_uint(*off_src) * 16u;
-
-   while (mask) {
-      unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0);
-      if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u)
-         return true;
-   }
-
-   return false;
+   uint64_t slot = per_vertex
+                   ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
+                   : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
+   return (((uint64_t) 1) << slot) & mask;
  }
  
  bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -4239,10 +4213,12 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
     if (instr->src[0].ssa->bit_size == 64)
        write_mask = widen_mask(write_mask, 2);
  
+   RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
+
     for (unsigned i = 0; i < 8; ++i) {
        if (write_mask & (1 << i)) {
           ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
-         ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
+         ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
        }
        idx++;
     }
@@ -4314,9 +4290,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
           /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
            * GFX9+: LS is merged into HS, but still uses the same LDS layout.
            */
-         unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
           Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
-         lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
+         lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
        } else {
           unreachable("Invalid LS or ES stage");
        }
@@ -5036,7 +5011,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
     Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
-   if (!ctx->divergent_vals[instr->dest.ssa.index])
+   if (!nir_dest_is_divergent(instr->dest))
        index = bld.as_uniform(index);
     unsigned desc_set = nir_intrinsic_desc_set(instr);
     unsigned binding = nir_intrinsic_binding(instr);
@@ -5629,8 +5604,12 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
     ctx->block->instructions.emplace_back(std::move(load));
  
     Operand sample_index4;
-   if (sample_index.isConstant() && sample_index.constantValue() < 16) {
-      sample_index4 = Operand(sample_index.constantValue() << 2);
+   if (sample_index.isConstant()) {
+      if (sample_index.constantValue() < 16) {
+         sample_index4 = Operand(sample_index.constantValue() << 2);
+      } else {
+         sample_index4 = Operand(0u);
+      }
     } else if (sample_index.regClass() == s1) {
        sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
     } else {
@@ -6108,7 +6087,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
     Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
     rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
  
-   bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
+   bool smem = !nir_src_is_divergent(instr->src[2]) &&
                 ctx->options->chip_class >= GFX8 &&
                 elem_size_bytes >= 4;
     if (smem)
@@ -6122,38 +6101,12 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
                        data, writemask, 16, &write_count, write_datas, offsets);
  
     for (unsigned i = 0; i < write_count; i++) {
-      aco_opcode vmem_op, smem_op = aco_opcode::last_opcode;
-      switch (write_datas[i].bytes()) {
-         case 1:
-            vmem_op = aco_opcode::buffer_store_byte;
-            break;
-         case 2:
-            vmem_op = aco_opcode::buffer_store_short;
-            break;
-         case 4:
-            vmem_op = aco_opcode::buffer_store_dword;
-            smem_op = aco_opcode::s_buffer_store_dword;
-            break;
-         case 8:
-            vmem_op = aco_opcode::buffer_store_dwordx2;
-            smem_op = aco_opcode::s_buffer_store_dwordx2;
-            break;
-         case 12:
-            vmem_op = aco_opcode::buffer_store_dwordx3;
-            assert(!smem && ctx->options->chip_class > GFX6);
-            break;
-         case 16:
-            vmem_op = aco_opcode::buffer_store_dwordx4;
-            smem_op = aco_opcode::s_buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("Store SSBO not implemented for this size.");
-      }
-      if (ctx->stage == fragment_fs)
-         smem_op = aco_opcode::p_fs_buffer_store_smem;
+      aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
+      if (smem && ctx->stage == fragment_fs)
+         op = aco_opcode::p_fs_buffer_store_smem;
  
        if (smem) {
-         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
+         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
           store->operands[0] = Operand(rsrc);
           if (offsets[i]) {
              Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
@@ -6162,7 +6115,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
           } else {
              store->operands[1] = Operand(offset);
           }
-         if (smem_op != aco_opcode::p_fs_buffer_store_smem)
+         if (op != aco_opcode::p_fs_buffer_store_smem)
              store->operands[1].setFixed(m0);
           store->operands[2] = Operand(write_datas[i]);
           store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
@@ -6171,12 +6124,12 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
           store->barrier = barrier_buffer;
           ctx->block->instructions.emplace_back(std::move(store));
           ctx->program->wb_smem_l1_on_end = true;
-         if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
+         if (op == aco_opcode::p_fs_buffer_store_smem) {
              ctx->block->kind |= block_kind_needs_lowering;
              ctx->program->needs_exact = true;
           }
        } else {
-         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
+         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
           store->operands[0] = Operand(rsrc);
           store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
           store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
@@ -6319,38 +6272,25 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
  
     Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
  
     if (ctx->options->chip_class >= GFX7)
        addr = as_vgpr(ctx, addr);
  
-   unsigned writemask = nir_intrinsic_write_mask(instr);
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      if (count == 3 && ctx->options->chip_class == GFX6) {
-         /* GFX6 doesn't support storing vec3, split it. */
-         writemask |= 1u << (start + 2);
-         count = 2;
-      }
-      unsigned num_bytes = count * elem_size_bytes;
-
-      Temp write_data = data;
-      if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++)
-            vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
-         write_data = bld.tmp(RegType::vgpr, count);
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      }
-
-      bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
-      unsigned offset = start * elem_size_bytes;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+                      16, &write_count, write_datas, offsets);
  
+   for (unsigned i = 0; i < write_count; i++) {
        if (ctx->options->chip_class >= GFX7) {
+         unsigned offset = offsets[i];
+         Temp store_addr = addr;
           if (offset > 0 && ctx->options->chip_class < GFX9) {
              Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
              Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
@@ -6363,14 +6303,20 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
                       Operand(0u), addr1,
                       carry).def(1).setHint(vcc);
  
-            addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
+            store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
  
              offset = 0;
           }
  
           bool global = ctx->options->chip_class >= GFX9;
           aco_opcode op;
-         switch (num_bytes) {
+         switch (write_datas[i].bytes()) {
+         case 1:
+            op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
+            break;
+         case 2:
+            op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
+            break;
           case 4:
              op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
              break;
@@ -6388,9 +6334,9 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
           }
  
           aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
-         flat->operands[0] = Operand(addr);
+         flat->operands[0] = Operand(store_addr);
           flat->operands[1] = Operand(s1);
-         flat->operands[2] = Operand(data);
+         flat->operands[2] = Operand(write_datas[i]);
           flat->glc = glc;
           flat->dlc = false;
           flat->offset = offset;
@@ -6401,20 +6347,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
        } else {
           assert(ctx->options->chip_class == GFX6);
  
-         aco_opcode op;
-         switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_store_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_store_dwordx2;
-            break;
-         case 16:
-            op = aco_opcode::buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("store_global not implemented for this size.");
-         }
+         aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
  
           Temp rsrc = get_gfx6_global_rsrc(bld, addr);
  
@@ -6422,10 +6355,10 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
           mubuf->operands[0] = Operand(rsrc);
           mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
           mubuf->operands[2] = Operand(0u);
-         mubuf->operands[3] = Operand(write_data);
+         mubuf->operands[3] = Operand(write_datas[i]);
           mubuf->glc = glc;
           mubuf->dlc = false;
-         mubuf->offset = offset;
+         mubuf->offset = offsets[i];
           mubuf->addr64 = addr.type() == RegType::vgpr;
           mubuf->disable_wqm = true;
           mubuf->barrier = barrier_buffer;
@@ -6621,7 +6554,6 @@ void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-   assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
     Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Builder bld(ctx->program, ctx->block);
  
@@ -6636,7 +6568,6 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
     Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
     Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
  
     unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
     store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
@@ -6799,63 +6730,23 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
  }
  
  void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
-   assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
     Builder bld(ctx->program, ctx->block);
     Temp rsrc = get_scratch_resource(ctx);
     Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
  
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   unsigned writemask = nir_intrinsic_write_mask(instr);
-
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      int num_bytes = count * elem_size_bytes;
-
-      if (num_bytes > 16) {
-         assert(elem_size_bytes == 8);
-         writemask |= (((count - 2) << 1) - 1) << (start + 2);
-         count = 2;
-         num_bytes = 16;
-      }
-
-      // TODO: check alignment of sub-dword stores
-      // TODO: split 3 bytes. there is no store instruction for that
-
-      Temp write_data;
-      if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++) {
-            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
-            vec->operands[i] = Operand(elem);
-         }
-         write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      } else {
-         write_data = data;
-      }
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); /* NOTE(review): presumably one mask bit per byte after widening; confirm against widen_mask() */
  
-      aco_opcode op;
-      switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_store_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_store_dwordx2;
-            break;
-         case 12:
-            op = aco_opcode::buffer_store_dwordx3;
-            break;
-         case 16:
-            op = aco_opcode::buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("Invalid data size for nir_intrinsic_store_scratch.");
-      }
+   unsigned write_count = 0; /* bounds the store loop below; passed by pointer to split_buffer_store() */
+   Temp write_datas[32]; /* per-store data, read as write_datas[i] in the loop below */
+   unsigned offsets[32]; /* per-store offset, read as offsets[i] in the loop below */
+   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+                      16, &write_count, write_datas, offsets); /* NOTE(review): 16 looks like the per-store byte limit; confirm against split_buffer_store() */
  
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
+   for (unsigned i = 0; i < write_count; i++) { /* one bld.mubuf() store per split entry */
+      aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); /* opcode picked from this entry's size in bytes */
+      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
     }
  }
  
@@ -7587,11 +7478,11 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_shuffle:
     case nir_intrinsic_read_invocation: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
+      if (!nir_src_is_divergent(instr->src[0])) {
           emit_uniform_subgroup(ctx, instr, src);
        } else {
           Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
-         if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
+         if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
              tid = bld.as_uniform(tid);
           Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
           if (src.regClass() == v1) {
@@ -7697,7 +7588,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           nir_intrinsic_cluster_size(instr) : 0;
        cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
  
-      if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
+      if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
           emit_uniform_subgroup(ctx, instr, src);
        } else if (instr->dest.ssa.bit_size == 1) {
           if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
@@ -7780,7 +7671,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     }
     case nir_intrinsic_quad_broadcast: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+      if (!nir_dest_is_divergent(instr->dest)) {
           emit_uniform_subgroup(ctx, instr, src);
        } else {
           Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
@@ -7827,7 +7718,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_quad_swap_diagonal:
     case nir_intrinsic_quad_swizzle_amd: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+      if (!nir_dest_is_divergent(instr->dest)) {
           emit_uniform_subgroup(ctx, instr, src);
           break;
        }
@@ -7889,7 +7780,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     }
     case nir_intrinsic_masked_swizzle_amd: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+      if (!nir_dest_is_divergent(instr->dest)) {
           emit_uniform_subgroup(ctx, instr, src);
           break;
        }
@@ -8281,9 +8172,11 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
     bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
-        has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
+        has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
+        has_clamped_lod = false;
     Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
-        lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
+        lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
+        clamped_lod = Temp();
     std::vector<Temp> coords;
     std::vector<Temp> derivs;
     nir_const_value *sample_index_cv = NULL;
@@ -8305,10 +8198,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           break;
        }
        case nir_tex_src_bias:
-         if (instr->op == nir_texop_txb) {
-            bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
-            has_bias = true;
-         }
+         bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
+         has_bias = true;
           break;
        case nir_tex_src_lod: {
           nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
@@ -8321,6 +8212,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           }
           break;
        }
+      case nir_tex_src_min_lod:
+         clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
+         has_clamped_lod = true;
+         break;
        case nir_tex_src_comparator:
           if (instr->is_shadow) {
              compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
@@ -8361,10 +8256,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
        Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
        Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
-      Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
  
+      Operand default_sample = Operand(1u);
+      if (ctx->options->robust_buffer_access) {
+         /* Extract the second dword of the descriptor; if it is
+          * all zero, then it's a null descriptor.
+          */
+         Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
+         Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
+         default_sample = Operand(is_non_null_descriptor);
+      }
+
+      Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
        bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
-               samples, Operand(1u), bld.scc(is_msaa));
+               samples, default_sample, bld.scc(is_msaa));
        return;
     }
  
@@ -8711,6 +8616,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        args.emplace_back(sample_index);
     if (has_lod)
        args.emplace_back(lod);
+   if (has_clamped_lod)
+      args.emplace_back(clamped_lod);
  
     Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
     aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
@@ -8755,7 +8662,21 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
     // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
     aco_opcode opcode = aco_opcode::image_sample;
     if (has_offset) { /* image_sample_*_o */
-      if (has_compare) {
+      if (has_clamped_lod) {
+         if (has_compare) {
+            opcode = aco_opcode::image_sample_c_cl_o;
+            if (has_derivs)
+               opcode = aco_opcode::image_sample_c_d_cl_o;
+            if (has_bias)
+               opcode = aco_opcode::image_sample_c_b_cl_o;
+         } else {
+            opcode = aco_opcode::image_sample_cl_o;
+            if (has_derivs)
+               opcode = aco_opcode::image_sample_d_cl_o;
+            if (has_bias)
+               opcode = aco_opcode::image_sample_b_cl_o;
+         }
+      } else if (has_compare) {
           opcode = aco_opcode::image_sample_c_o;
           if (has_derivs)
              opcode = aco_opcode::image_sample_c_d_o;
@@ -8776,6 +8697,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           if (has_lod)
              opcode = aco_opcode::image_sample_l_o;
        }
+   } else if (has_clamped_lod) { /* image_sample_*_cl */
+      if (has_compare) {
+         opcode = aco_opcode::image_sample_c_cl;
+         if (has_derivs)
+            opcode = aco_opcode::image_sample_c_d_cl;
+         if (has_bias)
+            opcode = aco_opcode::image_sample_c_b_cl;
+      } else {
+         opcode = aco_opcode::image_sample_cl;
+         if (has_derivs)
+            opcode = aco_opcode::image_sample_d_cl;
+         if (has_bias)
+            opcode = aco_opcode::image_sample_b_cl;
+      }
     } else { /* no offset */
        if (has_compare) {
           opcode = aco_opcode::image_sample_c;
@@ -8874,7 +8809,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
     assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
  
-   bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
+   bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
     logical |= ctx->block->kind & block_kind_merge;
     aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
  
@@ -9568,7 +9503,7 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
     aco_ptr<Pseudo_branch_instruction> branch;
     if_context ic;
  
-   if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
+   if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
        /**
         * Uniform conditionals are represented in the following way*) :
         *
@@ -9596,8 +9531,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
        visit_cf_list(ctx, &if_stmt->else_list);
  
        end_uniform_if(ctx, &ic);
-
-      return !ctx->cf_info.has_branch;
     } else { /* non-uniform condition */
        /**
         * To maintain a logical and linear CFG without critical edges,
@@ -9631,9 +9564,9 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
        visit_cf_list(ctx, &if_stmt->else_list);
  
        end_divergent_if(ctx, &ic);
-
-      return true;
     }
+
+   return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
  }
  
  static bool visit_cf_list(isel_context *ctx,
@@ -9914,6 +9847,7 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
  
     bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
     bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
+   bool is_16bit = values[0].regClass() == v2b;
  
     switch (col_format)
     {
@@ -9944,16 +9878,38 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
     case V_028714_SPI_SHADER_FP16_ABGR:
        enabled_channels = 0x5;
        compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+      if (is_16bit) {
+         if (ctx->options->chip_class >= GFX9) {
+            /* Pack the FP16 values together instead of converting them to
+             * FP32 and back to FP16.
+             * TODO: use p_create_vector and let the compiler optimize.
+             */
+            compr_op = aco_opcode::v_pack_b32_f16;
+         } else {
+            for (unsigned i = 0; i < 4; i++) {
+               if ((write_mask >> i) & 1)
+                  values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
+            }
+         }
+      }
        break;
  
     case V_028714_SPI_SHADER_UNORM16_ABGR:
        enabled_channels = 0x5;
-      compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+      if (is_16bit && ctx->options->chip_class >= GFX9) {
+         compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
+      } else {
+         compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+      }
        break;
  
     case V_028714_SPI_SHADER_SNORM16_ABGR:
        enabled_channels = 0x5;
-      compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+      if (is_16bit && ctx->options->chip_class >= GFX9) {
+         compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
+      } else {
+         compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+      }
        break;
  
     case V_028714_SPI_SHADER_UINT16_ABGR: {
@@ -9971,6 +9927,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
                                      values[i]);
              }
           }
+      } else if (is_16bit) {
+         for (unsigned i = 0; i < 4; i++) {
+            if ((write_mask >> i) & 1) {
+               Temp tmp = convert_int(bld, values[i].getTemp(), 16, 32, false);
+               values[i] = Operand(tmp);
+            }
+         }
        }
        break;
     }
@@ -9995,6 +9958,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
                                      values[i]);
              }
           }
+      } else if (is_16bit) {
+         for (unsigned i = 0; i < 4; i++) {
+            if ((write_mask >> i) & 1) {
+               Temp tmp = convert_int(bld, values[i].getTemp(), 16, 32, true);
+               values[i] = Operand(tmp);
+            }
+         }
        }
        break;
  
@@ -10785,8 +10755,6 @@ void select_program(Program *program,
        if (ngg_no_gs && !ngg_early_prim_export(&ctx))
           ngg_emit_nogs_output(&ctx);
  
-      ralloc_free(ctx.divergent_vals);
-
        if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
           /* Outputs of the previous stage are inputs to the next stage */
           ctx.inputs = ctx.outputs;