aco: Use context variables instead of calculating TCS inputs/outputs.
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index ca376e5052b5c5f7b8a537f1b13b6fa0004d0cf1..5a1629079174a4159261ce222937f0ab7192534c 100644
@@ -304,20 +304,21 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
       return;
    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
       return;
-   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
-   split->operands[0] = Operand(vec_src);
-   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
    RegClass rc;
    if (num_components > vec_src.size()) {
-      if (vec_src.type() == RegType::sgpr)
+      if (vec_src.type() == RegType::sgpr) {
+         /* splitting the SGPR vector into dwords should still help get_alu_src() */
+         emit_split_vector(ctx, vec_src, vec_src.size());
          return;
-
+      }
       /* sub-dword split */
-      assert(vec_src.type() == RegType::vgpr);
       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
    } else {
       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
    }
+   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
+   split->operands[0] = Operand(vec_src);
+   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
    for (unsigned i = 0; i < num_components; i++) {
       elems[i] = {ctx->program->allocateId(), rc};
       split->definitions[i] = Definition(elems[i]);
@@ -501,10 +502,11 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
          return vec;
 
       Temp dst{ctx->program->allocateId(), s1};
-      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 1)};
+      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
       bfe->operands[0] = Operand(vec);
       bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
       bfe->definitions[0] = Definition(dst);
+      bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
       ctx->block->instructions.emplace_back(std::move(bfe));
       return dst;
    }
@@ -1015,8 +1017,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
 
       if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
-         for (unsigned i = 0; i < num; ++i)
-            vec->operands[i] = Operand{elems[i]};
+         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
+         for (unsigned i = 0; i < num; ++i) {
+            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
+               vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
+            else
+               vec->operands[i] = Operand{elems[i]};
+         }
          vec->definitions[0] = Definition(dst);
          ctx->block->instructions.emplace_back(std::move(vec));
          ctx->allocated_vec.emplace(dst.id(), elems);
@@ -3606,143 +3613,213 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
    return dst;
 }
 
-Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
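+/* Splits `src` into `count` contiguous pieces whose start offsets (in bytes)
+ * are given by `offsets`, writing the resulting temporaries of register type
+ * `dst_type` into `dst`. Components recorded in ctx->allocated_vec are
+ * reused when the requested split lines up with them. */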
+void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
 {
-   if (start == 0 && size == data.size())
-      return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
+   if (!count)
+      return;
 
-   unsigned size_hint = 1;
-   auto it = ctx->allocated_vec.find(data.id());
-   if (it != ctx->allocated_vec.end())
-      size_hint = it->second[0].size();
-   if (size % size_hint || start % size_hint)
-      size_hint = 1;
+   Builder bld(ctx->program, ctx->block);
 
-   start /= size_hint;
-   size /= size_hint;
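+   /* pieces that aren't dword-aligned can only be represented as sub-dword VGPRs */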
+   ASSERTED bool is_subdword = false;
+   for (unsigned i = 0; i < count; i++)
+      is_subdword |= offsets[i] % 4;
+   is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
+   assert(!is_subdword || dst_type == RegType::vgpr);
 
-   Temp elems[size];
-   for (unsigned i = 0; i < size; i++)
-      elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
+   /* count == 1 fast path */
+   if (count == 1) {
+      if (dst_type == RegType::sgpr)
+         dst[0] = bld.as_uniform(src);
+      else
+         dst[0] = as_vgpr(ctx, src);
+      return;
+   }
 
-   if (size == 1)
-      return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
+   for (unsigned i = 0; i < count - 1; i++)
+      dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
+   dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
 
-   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
-   for (unsigned i = 0; i < size; i++)
-      vec->operands[i] = Operand(elems[i]);
-   Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
-   vec->definitions[0] = Definition(res);
-   ctx->block->instructions.emplace_back(std::move(vec));
-   return res;
+   if (is_subdword && src.type() == RegType::sgpr) {
+      src = as_vgpr(ctx, src);
+   } else {
+      /* use allocated_vec if possible */
+      auto it = ctx->allocated_vec.find(src.id());
+      if (it != ctx->allocated_vec.end()) {
+         unsigned total_size = 0;
+         for (unsigned i = 0; (i < NIR_MAX_VEC_COMPONENTS) && it->second[i].bytes(); i++)
+            total_size += it->second[i].bytes();
+         if (total_size != src.bytes())
+            goto split;
+
+         unsigned elem_size = it->second[0].bytes();
+
+         for (unsigned i = 0; i < count; i++) {
+            if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
+               goto split;
+         }
+
+         for (unsigned i = 0; i < count; i++) {
+            unsigned start_idx = offsets[i] / elem_size;
+            unsigned op_count = dst[i].bytes() / elem_size;
+            if (op_count == 1) {
+               if (dst_type == RegType::sgpr)
+                  dst[i] = bld.as_uniform(it->second[start_idx]);
+               else
+                  dst[i] = as_vgpr(ctx, it->second[start_idx]);
+               continue;
+            }
+
+            aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
+            for (unsigned j = 0; j < op_count; j++) {
+               Temp tmp = it->second[start_idx + j];
+               if (dst_type == RegType::sgpr)
+                  tmp = bld.as_uniform(tmp);
+               vec->operands[j] = Operand(tmp);
+            }
+            vec->definitions[0] = Definition(dst[i]);
+            bld.insert(std::move(vec));
+         }
+         return;
+      }
+   }
+
+   split:
+   if (dst_type == RegType::sgpr)
+      src = bld.as_uniform(src);
+
+   /* just split it */
+   aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
+   split->operands[0] = Operand(src);
+   for (unsigned i = 0; i < count; i++)
+      split->definitions[i] = Definition(dst[i]);
+   bld.insert(std::move(split));
+}
+
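+/* Finds the run of bytes at the lowest still-to-do position that are either
+ * all written or all skipped according to `mask`. Returns true if the run is
+ * written (it needs a store) and false if it can be skipped. */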
+bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
+                     int *start, int *count)
+{
+   unsigned start_elem = ffs(todo_mask) - 1;
+   bool skip = !(mask & (1u << start_elem));
+   if (skip)
+      mask = ~mask & todo_mask;
+
+   mask &= todo_mask;
+
+   u_bit_scan_consecutive_range(&mask, start, count);
+
+   return !skip;
 }
 
-void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
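+/* Marks count bytes starting at byte offset start as handled. */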
+void advance_write_mask(uint32_t *todo_mask, int start, int count)
 {
+   *todo_mask &= ~u_bit_consecutive(start, count);
+}
+
+void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
+               Temp address, unsigned base_offset, unsigned align)
+{
+   assert(util_is_power_of_two_nonzero(align));
+   assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
+
    Builder bld(ctx->program, ctx->block);
-   unsigned bytes_written = 0;
    bool large_ds_write = ctx->options->chip_class >= GFX7;
    bool usable_write2 = ctx->options->chip_class >= GFX7;
 
-   while (bytes_written < total_size * 4) {
-      unsigned todo = total_size * 4 - bytes_written;
-      bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
-      bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   aco_opcode opcodes[32];
+
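+   /* convert the per-component writemask into a per-byte writemask */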
+   wrmask = widen_mask(wrmask, elem_size_bytes);
+
+   uint32_t todo = u_bit_consecutive(0, data.bytes());
+   while (todo) {
+      int offset, bytes;
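+      /* runs of unwritten bytes still get an entry (with a num_opcodes
+       * placeholder) so that split_store_data() sees the complete layout of
+       * the source value */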
+      if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
+         offsets[write_count] = offset;
+         opcodes[write_count] = aco_opcode::num_opcodes;
+         write_count++;
+         advance_write_mask(&todo, offset, bytes);
+         continue;
+      }
+
+      bool aligned2 = offset % 2 == 0 && align % 2 == 0;
+      bool aligned4 = offset % 4 == 0 && align % 4 == 0;
+      bool aligned8 = offset % 8 == 0 && align % 8 == 0;
+      bool aligned16 = offset % 16 == 0 && align % 16 == 0;
 
-      aco_opcode op = aco_opcode::last_opcode;
-      bool write2 = false;
-      unsigned size = 0;
-      if (todo >= 16 && aligned16 && large_ds_write) {
+      // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
+      aco_opcode op = aco_opcode::num_opcodes;
+      if (bytes >= 16 && aligned16 && large_ds_write) {
          op = aco_opcode::ds_write_b128;
-         size = 4;
-      } else if (todo >= 16 && aligned8 && usable_write2) {
-         op = aco_opcode::ds_write2_b64;
-         write2 = true;
-         size = 4;
-      } else if (todo >= 12 && aligned16 && large_ds_write) {
+         bytes = 16;
+      } else if (bytes >= 12 && aligned16 && large_ds_write) {
          op = aco_opcode::ds_write_b96;
-         size = 3;
-      } else if (todo >= 8 && aligned8) {
+         bytes = 12;
+      } else if (bytes >= 8 && aligned8) {
          op = aco_opcode::ds_write_b64;
-         size = 2;
-      } else if (todo >= 8 && usable_write2) {
-         op = aco_opcode::ds_write2_b32;
-         write2 = true;
-         size = 2;
-      } else if (todo >= 4) {
+         bytes = 8;
+      } else if (bytes >= 4 && aligned4) {
          op = aco_opcode::ds_write_b32;
-         size = 1;
+         bytes = 4;
+      } else if (bytes >= 2 && aligned2) {
+         op = aco_opcode::ds_write_b16;
+         bytes = 2;
+      } else if (bytes >= 1) {
+         op = aco_opcode::ds_write_b8;
+         bytes = 1;
       } else {
          assert(false);
       }
 
-      unsigned offset = offset0 + offset1 + bytes_written;
-      unsigned max_offset = write2 ? 1020 : 65535;
-      Temp address_offset = address;
-      if (offset > max_offset) {
-         address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
-         offset = offset1 + bytes_written;
-      }
-      assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
+      offsets[write_count] = offset;
+      opcodes[write_count] = op;
+      write_count++;
+      advance_write_mask(&todo, offset, bytes);
+   }
 
-      if (write2) {
-         Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
-         Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
-         bld.ds(op, address_offset, val0, val1, m, offset / size / 2, (offset / size / 2) + 1);
-      } else {
-         Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
-         bld.ds(op, address_offset, val, m, offset);
-      }
+   Operand m = load_lds_size_m0(bld);
 
-      bytes_written += size * 4;
-   }
-}
+   split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
 
-void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
-               Temp address, unsigned base_offset, unsigned align)
-{
-   assert(util_is_power_of_two_nonzero(align) && align >= 4);
-   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+   for (unsigned i = 0; i < write_count; i++) {
+      aco_opcode op = opcodes[i];
+      if (op == aco_opcode::num_opcodes)
+         continue;
 
-   Builder bld(ctx->program, ctx->block);
-   Operand m = load_lds_size_m0(bld);
+      Temp data = write_datas[i];
 
-   /* we need at most two stores, assuming that the writemask is at most 4 bits wide */
-   assert(wrmask <= 0x0f);
-   int start[2], count[2];
-   u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
-   u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
-   assert(wrmask == 0);
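+      /* try to combine this store with a later one of the same size into a
+       * single ds_write2 */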
+      unsigned second = write_count;
+      if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
+         for (second = i + 1; second < write_count; second++) {
+            if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
+               op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
+               opcodes[second] = aco_opcode::num_opcodes;
+               break;
+            }
+         }
+      }
+
+      bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
+      unsigned write2_off = write2 ? (offsets[second] - offsets[i]) / data.bytes() : 0;
 
-   /* one combined store is sufficient */
-   if (count[0] == count[1] && (align % elem_size_bytes) == 0 && (base_offset % elem_size_bytes) == 0) {
+      unsigned inline_offset = base_offset + offsets[i];
+      unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
       Temp address_offset = address;
-      if ((base_offset / elem_size_bytes) + start[1] > 255) {
+      if (inline_offset > max_offset) {
          address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
-         base_offset = 0;
+         inline_offset = offsets[i];
       }
+      assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
 
-      assert(count[0] == 1);
-      RegClass xtract_rc(RegType::vgpr, elem_size_bytes / 4);
-
-      Temp val0 = emit_extract_vector(ctx, data, start[0], xtract_rc);
-      Temp val1 = emit_extract_vector(ctx, data, start[1], xtract_rc);
-      aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
-      base_offset = base_offset / elem_size_bytes;
-      bld.ds(op, address_offset, val0, val1, m,
-             base_offset + start[0], base_offset + start[1]);
-      return;
-   }
-
-   for (unsigned i = 0; i < 2; i++) {
-      if (count[i] == 0)
-         continue;
-
-      unsigned elem_size_words = elem_size_bytes / 4;
-      ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
-                      base_offset, start[i] * elem_size_bytes, align);
+      if (write2) {
+         Temp second_data = write_datas[second];
+         inline_offset /= data.bytes();
+         bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+      } else {
+         bld.ds(op, address_offset, data, m, inline_offset);
+      }
    }
-   return;
 }
 
 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
@@ -3755,6 +3832,82 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
 }
 
 
+aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+{
+   switch (bytes) {
+   case 1:
+      assert(!smem);
+      return aco_opcode::buffer_store_byte;
+   case 2:
+      assert(!smem);
+      return aco_opcode::buffer_store_short;
+   case 4:
+      return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+   case 8:
+      return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+   case 12:
+      assert(!smem);
+      return aco_opcode::buffer_store_dwordx3;
+   case 16:
+      return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+   }
+   unreachable("Unexpected store size");
+   return aco_opcode::num_opcodes;
+}
+
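+/* Splits a buffer store into several stores of hardware-supported sizes,
+ * skipping over holes in the byte-wide writemask. */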
+void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
+                        Temp data, unsigned writemask, int swizzle_element_size,
+                        unsigned *write_count, Temp *write_datas, unsigned *offsets)
+{
+   unsigned write_count_with_skips = 0;
+   bool skips[16];
+
+   /* determine how to split the data */
+   unsigned todo = u_bit_consecutive(0, data.bytes());
+   while (todo) {
+      int offset, bytes;
+      skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
+      offsets[write_count_with_skips] = offset;
+      if (skips[write_count_with_skips]) {
+         advance_write_mask(&todo, offset, bytes);
+         write_count_with_skips++;
+         continue;
+      }
+
+      /* the only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and a
+       * single store can't be larger than swizzle_element_size */
+      bytes = MIN2(bytes, swizzle_element_size);
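+      /* if the size isn't a dword multiple, round it down to one, or to 2/1 bytes */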
+      if (bytes % 4)
+         bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
+
+      /* SMEM and GFX6 VMEM can't emit 12-byte stores */
+      if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
+         bytes = 8;
+
+      /* dword or larger stores have to be dword-aligned */
+      unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
+      unsigned align_offset = instr ? nir_intrinsic_align_offset(instr) : 0;
+      bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0;
+      if (bytes >= 4 && !dword_aligned)
+         bytes = MIN2(bytes, 2);
+
+      advance_write_mask(&todo, offset, bytes);
+      write_count_with_skips++;
+   }
+
+   /* actually split data */
+   split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
+
+   /* remove skips */
+   for (unsigned i = 0; i < write_count_with_skips; i++) {
+      if (skips[i])
+         continue;
+      write_datas[*write_count] = write_datas[i];
+      offsets[*write_count] = offsets[i];
+      (*write_count)++;
+   }
+}
+
 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
                            unsigned split_cnt = 0u, Temp dst = Temp())
 {
@@ -3817,7 +3970,7 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
    assert(vdata.size() >= 1 && vdata.size() <= 4);
 
    Builder bld(ctx->program, ctx->block);
-   aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
+   aco_opcode op = get_buffer_store_op(false, vdata.bytes());
    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
 
    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
@@ -3836,35 +3989,17 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset
    Builder bld(ctx->program, ctx->block);
    assert(elem_size_bytes == 4 || elem_size_bytes == 8);
    assert(write_mask);
+   write_mask = widen_mask(write_mask, elem_size_bytes);
 
-   if (elem_size_bytes == 8) {
-      elem_size_bytes = 4;
-      write_mask = widen_mask(write_mask, 2);
-   }
-
-   while (write_mask) {
-      int start = 0;
-      int count = 0;
-      u_bit_scan_consecutive_range(&write_mask, &start, &count);
-      assert(count > 0);
-      assert(start >= 0);
-
-      while (count > 0) {
-         unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
-         unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
+                      allow_combining ? 16 : 4, &write_count, write_datas, offsets);
 
-         /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
-         if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
-            sub_count = 2;
-
-         Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
-         emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);
-
-         count -= sub_count;
-         start += sub_count;
-      }
-
-      assert(count == 0);
+   for (unsigned i = 0; i < write_count; i++) {
+      unsigned const_offset = offsets[i] + base_const_offset;
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
    }
 }
 
@@ -4018,11 +4153,9 @@ std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intri
    Builder bld(ctx->program, ctx->block);
 
    uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
-   uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
-   uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
-   uint32_t output_vertex_size = num_tcs_outputs * 16;
+   uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
    uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
-   uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+   uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
 
    std::pair<Temp, unsigned> offs = instr
                                     ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
@@ -4070,11 +4203,7 @@ std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx
 {
    Builder bld(ctx->program, ctx->block);
 
-   unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
-                              ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
-                              : ctx->args->options->key.tes.tcs_num_outputs;
-
-   unsigned output_vertex_size = num_tcs_outputs * 16;
+   unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
    unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
    unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
    unsigned attr_stride = ctx->tcs_num_patches;
@@ -4209,9 +4338,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
          /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
           * GFX9+: LS is merged into HS, but still uses the same LDS layout.
           */
-         unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
          Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
-         lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
+         lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
       } else {
          unreachable("Invalid LS or ES stage");
       }
@@ -5524,8 +5652,12 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
    ctx->block->instructions.emplace_back(std::move(load));
 
    Operand sample_index4;
-   if (sample_index.isConstant() && sample_index.constantValue() < 16) {
-      sample_index4 = Operand(sample_index.constantValue() << 2);
+   if (sample_index.isConstant()) {
+      if (sample_index.constantValue() < 16) {
+         sample_index4 = Operand(sample_index.constantValue() << 2);
+      } else {
+         sample_index4 = Operand(0u);
+      }
    } else if (sample_index.regClass() == s1) {
       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
    } else {
@@ -5997,7 +6129,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    Builder bld(ctx->program, ctx->block);
    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   unsigned writemask = nir_intrinsic_write_mask(instr);
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
 
    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
@@ -6010,124 +6142,47 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
       offset = bld.as_uniform(offset);
    bool smem_nonfs = smem && ctx->stage != fragment_fs;
 
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      if (count == 3 && (smem || ctx->options->chip_class == GFX6)) {
-         /* GFX6 doesn't support storing vec3, split it. */
-         writemask |= 1u << (start + 2);
-         count = 2;
-      }
-      int num_bytes = count * elem_size_bytes;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
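+   /* in fragment shaders, SMEM stores are lowered to p_fs_buffer_store_smem,
+    * which keeps the data in its current register file; only other stages
+    * need the data in SGPRs */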
+   split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
+                      data, writemask, 16, &write_count, write_datas, offsets);
 
-      /* dword or larger stores have to be dword-aligned */
-      if (elem_size_bytes < 4 && num_bytes > 2) {
-         // TODO: improve alignment check of sub-dword stores
-         unsigned count_new = 2 / elem_size_bytes;
-         writemask |= ((1 << (count - count_new)) - 1) << (start + count_new);
-         count = count_new;
-         num_bytes = 2;
-      }
-
-      if (num_bytes > 16) {
-         assert(elem_size_bytes == 8);
-         writemask |= (((count - 2) << 1) - 1) << (start + 2);
-         count = 2;
-         num_bytes = 16;
-      }
-
-      Temp write_data;
-      if (elem_size_bytes < 4) {
-         if (data.type() == RegType::sgpr) {
-            data = as_vgpr(ctx, data);
-            emit_split_vector(ctx, data, 4 * data.size() / elem_size_bytes);
-         }
-         RegClass rc = RegClass(RegType::vgpr, elem_size_bytes).as_subdword();
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++)
-            vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, rc));
-         write_data = bld.tmp(RegClass(RegType::vgpr, num_bytes).as_subdword());
-         vec->definitions[0] = Definition(write_data);
-         bld.insert(std::move(vec));
-      } else if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++) {
-            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
-            vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
-         }
-         write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      } else if (!smem && data.type() != RegType::vgpr) {
-         assert(num_bytes % 4 == 0);
-         write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
-      } else if (smem_nonfs && data.type() == RegType::vgpr) {
-         assert(num_bytes % 4 == 0);
-         write_data = bld.as_uniform(data);
-      } else {
-         write_data = data;
-      }
-
-      aco_opcode vmem_op, smem_op = aco_opcode::last_opcode;
-      switch (num_bytes) {
-         case 1:
-            vmem_op = aco_opcode::buffer_store_byte;
-            break;
-         case 2:
-            vmem_op = aco_opcode::buffer_store_short;
-            break;
-         case 4:
-            vmem_op = aco_opcode::buffer_store_dword;
-            smem_op = aco_opcode::s_buffer_store_dword;
-            break;
-         case 8:
-            vmem_op = aco_opcode::buffer_store_dwordx2;
-            smem_op = aco_opcode::s_buffer_store_dwordx2;
-            break;
-         case 12:
-            vmem_op = aco_opcode::buffer_store_dwordx3;
-            assert(!smem && ctx->options->chip_class > GFX6);
-            break;
-         case 16:
-            vmem_op = aco_opcode::buffer_store_dwordx4;
-            smem_op = aco_opcode::s_buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("Store SSBO not implemented for this size.");
-      }
-      if (ctx->stage == fragment_fs)
-         smem_op = aco_opcode::p_fs_buffer_store_smem;
+   for (unsigned i = 0; i < write_count; i++) {
+      aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
+      if (smem && ctx->stage == fragment_fs)
+         op = aco_opcode::p_fs_buffer_store_smem;
 
       if (smem) {
-         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
+         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
          store->operands[0] = Operand(rsrc);
-         if (start) {
+         if (offsets[i]) {
             Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                offset, Operand(start * elem_size_bytes));
+                                offset, Operand(offsets[i]));
             store->operands[1] = Operand(off);
          } else {
             store->operands[1] = Operand(offset);
          }
-         if (smem_op != aco_opcode::p_fs_buffer_store_smem)
+         if (op != aco_opcode::p_fs_buffer_store_smem)
             store->operands[1].setFixed(m0);
-         store->operands[2] = Operand(write_data);
+         store->operands[2] = Operand(write_datas[i]);
          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
          store->dlc = false;
          store->disable_wqm = true;
          store->barrier = barrier_buffer;
          ctx->block->instructions.emplace_back(std::move(store));
          ctx->program->wb_smem_l1_on_end = true;
-         if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
+         if (op == aco_opcode::p_fs_buffer_store_smem) {
             ctx->block->kind |= block_kind_needs_lowering;
             ctx->program->needs_exact = true;
          }
       } else {
-         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
+         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
          store->operands[0] = Operand(rsrc);
          store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
-         store->operands[3] = Operand(write_data);
-         store->offset = start * elem_size_bytes;
+         store->operands[3] = Operand(write_datas[i]);
+         store->offset = offsets[i];
          store->offen = (offset.type() == RegType::vgpr);
          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
          store->dlc = false;
@@ -6265,38 +6320,25 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    Builder bld(ctx->program, ctx->block);
    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
 
    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
    Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
 
    if (ctx->options->chip_class >= GFX7)
       addr = as_vgpr(ctx, addr);
 
-   unsigned writemask = nir_intrinsic_write_mask(instr);
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      if (count == 3 && ctx->options->chip_class == GFX6) {
-         /* GFX6 doesn't support storing vec3, split it. */
-         writemask |= 1u << (start + 2);
-         count = 2;
-      }
-      unsigned num_bytes = count * elem_size_bytes;
-
-      Temp write_data = data;
-      if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++)
-            vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
-         write_data = bld.tmp(RegType::vgpr, count);
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      }
-
-      bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
-      unsigned offset = start * elem_size_bytes;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+                      16, &write_count, write_datas, offsets);
 
+   for (unsigned i = 0; i < write_count; i++) {
       if (ctx->options->chip_class >= GFX7) {
+         unsigned offset = offsets[i];
+         Temp store_addr = addr;
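+         /* FLAT instructions lack an immediate offset before GFX9, so fold
+          * the offset into the address instead */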
          if (offset > 0 && ctx->options->chip_class < GFX9) {
             Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
             Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
@@ -6309,14 +6351,20 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
                      Operand(0u), addr1,
                      carry).def(1).setHint(vcc);
 
-            addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
+            store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
 
             offset = 0;
          }
 
          bool global = ctx->options->chip_class >= GFX9;
          aco_opcode op;
-         switch (num_bytes) {
+         switch (write_datas[i].bytes()) {
+         case 1:
+            op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
+            break;
+         case 2:
+            op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
+            break;
          case 4:
             op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
             break;
@@ -6334,9 +6382,9 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
          }
 
          aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
-         flat->operands[0] = Operand(addr);
+         flat->operands[0] = Operand(store_addr);
          flat->operands[1] = Operand(s1);
-         flat->operands[2] = Operand(data);
+         flat->operands[2] = Operand(write_datas[i]);
          flat->glc = glc;
          flat->dlc = false;
          flat->offset = offset;
@@ -6347,20 +6395,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
       } else {
          assert(ctx->options->chip_class == GFX6);
 
-         aco_opcode op;
-         switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_store_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_store_dwordx2;
-            break;
-         case 16:
-            op = aco_opcode::buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("store_global not implemented for this size.");
-         }
+         aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
 
          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
 
@@ -6368,10 +6403,10 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
          mubuf->operands[0] = Operand(rsrc);
          mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
          mubuf->operands[2] = Operand(0u);
-         mubuf->operands[3] = Operand(write_data);
+         mubuf->operands[3] = Operand(write_datas[i]);
          mubuf->glc = glc;
          mubuf->dlc = false;
-         mubuf->offset = offset;
+         mubuf->offset = offsets[i];
          mubuf->addr64 = addr.type() == RegType::vgpr;
          mubuf->disable_wqm = true;
          mubuf->barrier = barrier_buffer;
@@ -6567,7 +6602,6 @@ void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-   assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
    Builder bld(ctx->program, ctx->block);
 
@@ -6582,7 +6616,6 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
 
    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
@@ -6745,63 +6778,23 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
 }
 
 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
-   assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
    Builder bld(ctx->program, ctx->block);
    Temp rsrc = get_scratch_resource(ctx);
    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
 
    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   unsigned writemask = nir_intrinsic_write_mask(instr);
-
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      int num_bytes = count * elem_size_bytes;
-
-      if (num_bytes > 16) {
-         assert(elem_size_bytes == 8);
-         writemask |= (((count - 2) << 1) - 1) << (start + 2);
-         count = 2;
-         num_bytes = 16;
-      }
-
-      // TODO: check alignment of sub-dword stores
-      // TODO: split 3 bytes. there is no store instruction for that
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
 
-      Temp write_data;
-      if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++) {
-            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
-            vec->operands[i] = Operand(elem);
-         }
-         write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      } else {
-         write_data = data;
-      }
-
-      aco_opcode op;
-      switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_store_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_store_dwordx2;
-            break;
-         case 12:
-            op = aco_opcode::buffer_store_dwordx3;
-            break;
-         case 16:
-            op = aco_opcode::buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("Invalid data size for nir_intrinsic_store_scratch.");
-      }
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+                      16, &write_count, write_datas, offsets);
 
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
+   for (unsigned i = 0; i < write_count; i++) {
+      aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
    }
 }
 
@@ -8307,10 +8300,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
-      Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
 
+      Operand default_sample = Operand(1u);
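+      /* non-MSAA images report a single sample; with robust buffer access,
+       * a null descriptor must report zero samples instead */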
+      if (ctx->options->robust_buffer_access) {
+         /* Extract the second dword of the descriptor; if it's
+          * all zero, then it's a null descriptor.
+          */
+         Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
+         Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
+         default_sample = Operand(is_non_null_descriptor);
+      }
+
+      Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
-               samples, Operand(1u), bld.scc(is_msaa));
+               samples, default_sample, bld.scc(is_msaa));
       return;
    }
 
@@ -9542,8 +9545,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
       visit_cf_list(ctx, &if_stmt->else_list);
 
       end_uniform_if(ctx, &ic);
-
-      return !ctx->cf_info.has_branch;
    } else { /* non-uniform condition */
       /**
        * To maintain a logical and linear CFG without critical edges,
@@ -9577,9 +9578,9 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
       visit_cf_list(ctx, &if_stmt->else_list);
 
       end_divergent_if(ctx, &ic);
-
-      return true;
    }
+
+   return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
 }
 
 static bool visit_cf_list(isel_context *ctx,