radeonsi: fix tess levels coming as scalar arrays from SPIR-V
[mesa.git] / src / gallium / drivers / radeonsi / si_shader_llvm_tess.c
index 549c4bf6b11e0a43e093d952ada47ccf5ad39e0f..13bed5f2569757d316b15c0af904daddbde64145 100644 (file)
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "si_shader_internal.h"
 #include "si_pipe.h"
+#include "si_shader_internal.h"
 #include "sid.h"
 
 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 {
-       switch (ctx->type) {
-       case PIPE_SHADER_TESS_CTRL:
-               return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
+   switch (ctx->type) {
+   case PIPE_SHADER_TESS_CTRL:
+      return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
 
-       case PIPE_SHADER_TESS_EVAL:
-               return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
+   case PIPE_SHADER_TESS_EVAL:
+      return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
 
-       default:
-               assert(0);
-               return NULL;
-       }
+   default:
+      assert(0);
+      return NULL;
+   }
 }
 
 /* Tessellation shaders pass outputs to the next shader using LDS.
@@ -62,151 +62,134 @@ static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
  * All three shaders VS(LS), TCS, TES share the same LDS space.
  */
 
-static LLVMValueRef
-get_tcs_in_patch_stride(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_in_patch_stride(struct si_shader_context *ctx)
 {
-       return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
+   return si_unpack_param(ctx, ctx->vs_state_bits, 11, 13);
 }
 
 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
 {
-       assert(ctx->type == PIPE_SHADER_TESS_CTRL);
+   assert(ctx->type == PIPE_SHADER_TESS_CTRL);
 
-       if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
-               return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
+   if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
+      return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
 
-       return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
+   return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
 }
 
 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
 {
-       unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+   unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 
-       return LLVMConstInt(ctx->i32, stride, 0);
+   return LLVMConstInt(ctx->ac.i32, stride, 0);
 }
 
 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 {
-       if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
-               return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
-
-       const struct si_shader_info *info = &ctx->shader->selector->info;
-       unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
-       unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
-       unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
-       unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
-                                  num_patch_outputs * 4;
-       return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
+   if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
+      return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
+
+   const struct si_shader_info *info = &ctx->shader->selector->info;
+   unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+   unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
+   unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
+   unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride + num_patch_outputs * 4;
+   return LLVMConstInt(ctx->ac.i32, patch_dw_stride, 0);
 }
 
-static LLVMValueRef
-get_tcs_out_patch0_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 {
-       return LLVMBuildMul(ctx->ac.builder,
-                           si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
-                           LLVMConstInt(ctx->i32, 4, 0), "");
+   return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
+                       LLVMConstInt(ctx->ac.i32, 4, 0), "");
 }
 
-static LLVMValueRef
-get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 {
-       return LLVMBuildMul(ctx->ac.builder,
-                           si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
-                           LLVMConstInt(ctx->i32, 4, 0), "");
+   return LLVMBuildMul(ctx->ac.builder, si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
+                       LLVMConstInt(ctx->ac.i32, 4, 0), "");
 }
 
-static LLVMValueRef
-get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 {
-       LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
-       LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+   LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
+   LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
+   return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 }
 
-static LLVMValueRef
-get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 {
-       LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
-       LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
-       LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+   LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
+   LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+   LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
+   return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
 }
 
-static LLVMValueRef
-get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
+static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 {
-       LLVMValueRef patch0_patch_data_offset =
-               get_tcs_out_patch0_patch_data_offset(ctx);
-       LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
-       LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
+   LLVMValueRef patch0_patch_data_offset = get_tcs_out_patch0_patch_data_offset(ctx);
+   LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
+   LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 
-       return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
+   return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
 }
 
 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 {
-       unsigned tcs_out_vertices =
-               ctx->shader->selector ?
-               ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
+   unsigned tcs_out_vertices =
+      ctx->shader->selector ? ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT]
+                            : 0;
 
-       /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
-       if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
-               return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
+   /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
+   if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
+      return LLVMConstInt(ctx->ac.i32, tcs_out_vertices, 0);
 
-       return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
+   return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
 }
 
 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 {
-       unsigned stride;
-
-       switch (ctx->type) {
-       case PIPE_SHADER_VERTEX:
-               stride = ctx->shader->selector->lshs_vertex_stride / 4;
-               return LLVMConstInt(ctx->i32, stride, 0);
-
-       case PIPE_SHADER_TESS_CTRL:
-               if (ctx->screen->info.chip_class >= GFX9 &&
-                   ctx->shader->is_monolithic) {
-                       stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
-                       return LLVMConstInt(ctx->i32, stride, 0);
-               }
-               return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
-
-       default:
-               assert(0);
-               return NULL;
-       }
+   unsigned stride;
+
+   switch (ctx->type) {
+   case PIPE_SHADER_VERTEX:
+      stride = ctx->shader->selector->lshs_vertex_stride / 4;
+      return LLVMConstInt(ctx->ac.i32, stride, 0);
+
+   case PIPE_SHADER_TESS_CTRL:
+      if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) {
+         stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+         return LLVMConstInt(ctx->ac.i32, stride, 0);
+      }
+      return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
+
+   default:
+      assert(0);
+      return NULL;
+   }
 }
 
-static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
-                                                       LLVMValueRef vertex_dw_stride,
-                                                       LLVMValueRef base_addr,
-                                                       LLVMValueRef vertex_index,
-                                                       LLVMValueRef param_index,
-                                                       ubyte name, ubyte index)
+static LLVMValueRef
+get_dw_address_from_generic_indices(struct si_shader_context *ctx, LLVMValueRef vertex_dw_stride,
+                                    LLVMValueRef base_addr, LLVMValueRef vertex_index,
+                                    LLVMValueRef param_index, ubyte name, ubyte index)
 {
-       if (vertex_dw_stride) {
-               base_addr = ac_build_imad(&ctx->ac, vertex_index,
-                                         vertex_dw_stride, base_addr);
-       }
-
-       if (param_index) {
-               base_addr = ac_build_imad(&ctx->ac, param_index,
-                                         LLVMConstInt(ctx->i32, 4, 0), base_addr);
-       }
-
-       int param = name == TGSI_SEMANTIC_PATCH ||
-                   name == TGSI_SEMANTIC_TESSINNER ||
-                   name == TGSI_SEMANTIC_TESSOUTER ?
-               si_shader_io_get_unique_index_patch(name, index) :
-               si_shader_io_get_unique_index(name, index, false);
-
-       /* Add the base address of the element. */
-       return LLVMBuildAdd(ctx->ac.builder, base_addr,
-                           LLVMConstInt(ctx->i32, param * 4, 0), "");
+   if (vertex_dw_stride) {
+      base_addr = ac_build_imad(&ctx->ac, vertex_index, vertex_dw_stride, base_addr);
+   }
+
+   if (param_index) {
+      base_addr = ac_build_imad(&ctx->ac, param_index, LLVMConstInt(ctx->ac.i32, 4, 0), base_addr);
+   }
+
+   int param = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+                     name == TGSI_SEMANTIC_TESSOUTER
+                  ? si_shader_io_get_unique_index_patch(name, index)
+                  : si_shader_io_get_unique_index(name, index, false);
+
+   /* Add the base address of the element. */
+   return LLVMBuildAdd(ctx->ac.builder, base_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
 }
 
 /* The offchip buffer layout for TCS->TES is
@@ -228,98 +211,88 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context
  * Note that every attribute has 4 components.
  */
 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
-                                              LLVMValueRef rel_patch_id,
-                                               LLVMValueRef vertex_index,
+                                               LLVMValueRef rel_patch_id, LLVMValueRef vertex_index,
                                                LLVMValueRef param_index)
 {
-       LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
-       LLVMValueRef param_stride, constant16;
-
-       vertices_per_patch = get_num_tcs_out_vertices(ctx);
-       num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
-       total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
-                                     num_patches, "");
-
-       constant16 = LLVMConstInt(ctx->i32, 16, 0);
-       if (vertex_index) {
-               base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
-                                         vertices_per_patch, vertex_index);
-               param_stride = total_vertices;
-       } else {
-               base_addr = rel_patch_id;
-               param_stride = num_patches;
-       }
-
-       base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
-       base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
-
-       if (!vertex_index) {
-               LLVMValueRef patch_data_offset =
-                          si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
-
-               base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
-                                        patch_data_offset, "");
-       }
-       return base_addr;
+   LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
+   LLVMValueRef param_stride, constant16;
+
+   vertices_per_patch = get_num_tcs_out_vertices(ctx);
+   num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
+   total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch, num_patches, "");
+
+   constant16 = LLVMConstInt(ctx->ac.i32, 16, 0);
+   if (vertex_index) {
+      base_addr = ac_build_imad(&ctx->ac, rel_patch_id, vertices_per_patch, vertex_index);
+      param_stride = total_vertices;
+   } else {
+      base_addr = rel_patch_id;
+      param_stride = num_patches;
+   }
+
+   base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
+   base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
+
+   if (!vertex_index) {
+      LLVMValueRef patch_data_offset = si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
+
+      base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, patch_data_offset, "");
+   }
+   return base_addr;
 }
 
-static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
-                                       struct si_shader_context *ctx,
-                                       LLVMValueRef vertex_index,
-                                       LLVMValueRef param_index,
-                                       ubyte name, ubyte index)
+static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(struct si_shader_context *ctx,
+                                                                    LLVMValueRef vertex_index,
+                                                                    LLVMValueRef param_index,
+                                                                    ubyte name, ubyte index)
 {
-       unsigned param_index_base;
-
-       param_index_base = name == TGSI_SEMANTIC_PATCH ||
-                          name == TGSI_SEMANTIC_TESSINNER ||
-                          name == TGSI_SEMANTIC_TESSOUTER ?
-               si_shader_io_get_unique_index_patch(name, index) :
-               si_shader_io_get_unique_index(name, index, false);
-
-       if (param_index) {
-               param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
-                                          LLVMConstInt(ctx->i32, param_index_base, 0),
-                                          "");
-       } else {
-               param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
-       }
-
-       return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
-                                         vertex_index, param_index);
+   unsigned param_index_base;
+
+   param_index_base = name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+                            name == TGSI_SEMANTIC_TESSOUTER
+                         ? si_shader_io_get_unique_index_patch(name, index)
+                         : si_shader_io_get_unique_index(name, index, false);
+
+   if (param_index) {
+      param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
+                                 LLVMConstInt(ctx->ac.i32, param_index_base, 0), "");
+   } else {
+      param_index = LLVMConstInt(ctx->ac.i32, param_index_base, 0);
+   }
+
+   return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), vertex_index, param_index);
 }
 
-static LLVMValueRef buffer_load(struct si_shader_context *ctx,
-                                LLVMTypeRef type, unsigned swizzle,
-                                LLVMValueRef buffer, LLVMValueRef offset,
-                                LLVMValueRef base, bool can_speculate)
+static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
+                                LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base,
+                                bool can_speculate)
 {
-       LLVMValueRef value, value2;
-       LLVMTypeRef vec_type = LLVMVectorType(type, 4);
+   LLVMValueRef value, value2;
+   LLVMTypeRef vec_type = LLVMVectorType(type, 4);
 
-       if (swizzle == ~0) {
-               value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, ac_glc, can_speculate, false);
+   if (swizzle == ~0) {
+      value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
+                                   can_speculate, false);
 
-               return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
-       }
+      return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
+   }
 
-       if (ac_get_type_size(type) != 64) {
-               value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, ac_glc, can_speculate, false);
+   if (ac_get_type_size(type) != 8) {
+      value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc,
+                                   can_speculate, false);
 
-               value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
-               return LLVMBuildExtractElement(ctx->ac.builder, value,
-                                   LLVMConstInt(ctx->i32, swizzle, 0), "");
-       }
+      value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
+      return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0),
+                                     "");
+   }
 
-       value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                 swizzle * 4, ac_glc, can_speculate, false);
+   value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, ac_glc,
+                                can_speculate, false);
 
-       value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                  swizzle * 4 + 4, ac_glc, can_speculate, false);
+   value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, ac_glc,
+                                 can_speculate, false);
 
-       return si_build_gather_64bit(ctx, type, value, value2);
+   return si_build_gather_64bit(ctx, type, value, value2);
 }
 
 /**
@@ -329,36 +302,34 @@ static LLVMValueRef buffer_load(struct si_shader_context *ctx,
  * \param swizzle      offset (typically 0..3); it can be ~0, which loads a vec4
  * \param dw_addr      address in dwords
  */
-static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
-                                 LLVMTypeRef type, unsigned swizzle,
-                                 LLVMValueRef dw_addr)
+static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef type, unsigned swizzle,
+                                  LLVMValueRef dw_addr)
 {
-       LLVMValueRef value;
+   LLVMValueRef value;
 
-       if (swizzle == ~0) {
-               LLVMValueRef values[4];
+   if (swizzle == ~0) {
+      LLVMValueRef values[4];
 
-               for (unsigned chan = 0; chan < 4; chan++)
-                       values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
+      for (unsigned chan = 0; chan < 4; chan++)
+         values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
 
-               return ac_build_gather_values(&ctx->ac, values, 4);
-       }
+      return ac_build_gather_values(&ctx->ac, values, 4);
+   }
 
-       /* Split 64-bit loads. */
-       if (ac_get_type_size(type) == 64) {
-               LLVMValueRef lo, hi;
+   /* Split 64-bit loads. */
+   if (ac_get_type_size(type) == 8) {
+      LLVMValueRef lo, hi;
 
-               lo = lshs_lds_load(ctx, ctx->i32, swizzle, dw_addr);
-               hi = lshs_lds_load(ctx, ctx->i32, swizzle + 1, dw_addr);
-               return si_build_gather_64bit(ctx, type, lo, hi);
-       }
+      lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr);
+      hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr);
+      return si_build_gather_64bit(ctx, type, lo, hi);
+   }
 
-       dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
-                              LLVMConstInt(ctx->i32, swizzle, 0), "");
+   dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), "");
 
-       value = ac_lds_load(&ctx->ac, dw_addr);
+   value = ac_lds_load(&ctx->ac, dw_addr);
 
-       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
 }
 
 /**
@@ -368,421 +339,373 @@ static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
  * \param dw_addr      address in dwords
  * \param value                value to store
  */
-static void lshs_lds_store(struct si_shader_context *ctx,
-                     unsigned dw_offset_imm, LLVMValueRef dw_addr,
-                     LLVMValueRef value)
+static void lshs_lds_store(struct si_shader_context *ctx, unsigned dw_offset_imm,
+                           LLVMValueRef dw_addr, LLVMValueRef value)
 {
-       dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
-                              LLVMConstInt(ctx->i32, dw_offset_imm, 0), "");
+   dw_addr =
+      LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, dw_offset_imm, 0), "");
 
-       ac_lds_store(&ctx->ac, dw_addr, value);
+   ac_lds_store(&ctx->ac, dw_addr, value);
 }
 
-enum si_tess_ring {
-       TCS_FACTOR_RING,
-       TESS_OFFCHIP_RING_TCS,
-       TESS_OFFCHIP_RING_TES,
+enum si_tess_ring
+{
+   TCS_FACTOR_RING,
+   TESS_OFFCHIP_RING_TCS,
+   TESS_OFFCHIP_RING_TES,
 };
 
-static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
-                                            enum si_tess_ring ring)
+static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx, enum si_tess_ring ring)
 {
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef addr = ac_get_arg(&ctx->ac,
-                                      ring == TESS_OFFCHIP_RING_TES ?
-                                      ctx->tes_offchip_addr :
-                                      ctx->tcs_out_lds_layout);
-
-       /* TCS only receives high 13 bits of the address. */
-       if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
-               addr = LLVMBuildAnd(builder, addr,
-                                   LLVMConstInt(ctx->i32, 0xfff80000, 0), "");
-       }
-
-       if (ring == TCS_FACTOR_RING) {
-               unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
-               addr = LLVMBuildAdd(builder, addr,
-                                   LLVMConstInt(ctx->i32, tf_offset, 0), "");
-       }
-
-       uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
-       if (ctx->screen->info.chip_class >= GFX10)
-               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                        S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
-                        S_008F0C_RESOURCE_LEVEL(1);
-       else
-               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
-       LLVMValueRef desc[4];
-       desc[0] = addr;
-       desc[1] = LLVMConstInt(ctx->i32,
-                              S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
-       desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
-       desc[3] = LLVMConstInt(ctx->i32, rsrc3, false);
-
-       return ac_build_gather_values(&ctx->ac, desc, 4);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef addr = ac_get_arg(
+      &ctx->ac, ring == TESS_OFFCHIP_RING_TES ? ctx->tes_offchip_addr : ctx->tcs_out_lds_layout);
+
+   /* TCS only receives high 13 bits of the address. */
+   if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
+      addr = LLVMBuildAnd(builder, addr, LLVMConstInt(ctx->ac.i32, 0xfff80000, 0), "");
+   }
+
+   if (ring == TCS_FACTOR_RING) {
+      unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
+      addr = LLVMBuildAdd(builder, addr, LLVMConstInt(ctx->ac.i32, tf_offset, 0), "");
+   }
+
+   uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                    S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+
+   if (ctx->screen->info.chip_class >= GFX10)
+      rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
+   else
+      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+   LLVMValueRef desc[4];
+   desc[0] = addr;
+   desc[1] = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
+   desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
+   desc[3] = LLVMConstInt(ctx->ac.i32, rsrc3, false);
+
+   return ac_build_gather_values(&ctx->ac, desc, 4);
 }
 
 void si_llvm_preload_tes_rings(struct si_shader_context *ctx)
 {
-       ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
+   ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
 }
 
-static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
-                                            LLVMTypeRef type,
-                                            LLVMValueRef vertex_index,
-                                            LLVMValueRef param_index,
-                                            unsigned const_index,
-                                            unsigned location,
-                                            unsigned driver_location,
-                                            unsigned component,
-                                            unsigned num_components,
-                                            bool is_patch,
-                                            bool is_compact,
-                                            bool load_input)
+static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMTypeRef type,
+                                             LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                             unsigned const_index, unsigned location,
+                                             unsigned driver_location, unsigned component,
+                                             unsigned num_components, bool is_patch,
+                                             bool is_compact, bool load_input)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       LLVMValueRef dw_addr, stride;
-       ubyte name, index;
-
-       driver_location = driver_location / 4;
-
-       if (load_input) {
-               name = info->input_semantic_name[driver_location];
-               index = info->input_semantic_index[driver_location];
-       } else {
-               name = info->output_semantic_name[driver_location];
-               index = info->output_semantic_index[driver_location];
-       }
-
-       assert((name == TGSI_SEMANTIC_PATCH ||
-               name == TGSI_SEMANTIC_TESSINNER ||
-               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
-       if (load_input) {
-               stride = get_tcs_in_vertex_dw_stride(ctx);
-               dw_addr = get_tcs_in_current_patch_offset(ctx);
-       } else {
-               if (is_patch) {
-                       stride = NULL;
-                       dw_addr = get_tcs_out_current_patch_data_offset(ctx);
-               } else {
-                       stride = get_tcs_out_vertex_dw_stride(ctx);
-                       dw_addr = get_tcs_out_current_patch_offset(ctx);
-               }
-       }
-
-       if (!param_index) {
-               param_index = LLVMConstInt(ctx->i32, const_index, 0);
-       }
-
-       dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
-                                                     vertex_index, param_index,
-                                                     name, index);
-
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 64)
-                       offset *= 2;
-
-               offset += component;
-               value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
-       }
-
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   LLVMValueRef dw_addr, stride;
+   ubyte name, index;
+
+   driver_location = driver_location / 4;
+
+   if (load_input) {
+      name = info->input_semantic_name[driver_location];
+      index = info->input_semantic_index[driver_location];
+   } else {
+      name = info->output_semantic_name[driver_location];
+      index = info->output_semantic_index[driver_location];
+   }
+
+   assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+           name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
+   if (load_input) {
+      stride = get_tcs_in_vertex_dw_stride(ctx);
+      dw_addr = get_tcs_in_current_patch_offset(ctx);
+   } else {
+      if (is_patch) {
+         stride = NULL;
+         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+      } else {
+         stride = get_tcs_out_vertex_dw_stride(ctx);
+         dw_addr = get_tcs_out_current_patch_offset(ctx);
+      }
+   }
+
+   if (!param_index) {
+      param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+   }
+
+   dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
+                                                 name, index);
+
+   LLVMValueRef value[4];
+   for (unsigned i = 0; i < num_components; i++) {
+      unsigned offset = i;
+      if (ac_get_type_size(type) == 8)
+         offset *= 2;
+
+      offset += component;
+      value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
+   }
+
+   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 }
 
-LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
-                                  LLVMTypeRef type,
-                                  LLVMValueRef vertex_index,
-                                  LLVMValueRef param_index,
-                                  unsigned const_index,
-                                  unsigned location,
-                                  unsigned driver_location,
-                                  unsigned component,
-                                  unsigned num_components,
-                                  bool is_patch,
-                                  bool is_compact,
-                                  bool load_input)
+static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef type,
+                                          LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                          unsigned const_index, unsigned location,
+                                          unsigned driver_location, unsigned component,
+                                          unsigned num_components, bool is_patch, bool is_compact,
+                                          bool load_input)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       LLVMValueRef base, addr;
-
-       driver_location = driver_location / 4;
-       ubyte name = info->input_semantic_name[driver_location];
-       ubyte index = info->input_semantic_index[driver_location];
-
-       assert((name == TGSI_SEMANTIC_PATCH ||
-               name == TGSI_SEMANTIC_TESSINNER ||
-               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
-       base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
-       if (!param_index) {
-               param_index = LLVMConstInt(ctx->i32, const_index, 0);
-       }
-
-       addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index,
-                                                              name, index);
-
-       /* TODO: This will generate rather ordinary llvm code, although it
-        * should be easy for the optimiser to fix up. In future we might want
-        * to refactor buffer_load().
-        */
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 64) {
-                       offset *= 2;
-                       if (offset == 4) {
-                               ubyte name = info->input_semantic_name[driver_location + 1];
-                               ubyte index = info->input_semantic_index[driver_location + 1];
-                                addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
-                                                                                       vertex_index,
-                                                                                       param_index,
-                                                                                      name, index);
-                       }
-
-                        offset = offset % 4;
-               }
-
-               offset += component;
-               value[i + component] = buffer_load(ctx, type, offset,
-                                                  ctx->tess_offchip_ring, base, addr, true);
-       }
-
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   LLVMValueRef base, addr;
+
+   driver_location = driver_location / 4;
+   ubyte name = info->input_semantic_name[driver_location];
+   ubyte index = info->input_semantic_index[driver_location];
+
+   assert((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+           name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
+   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+   if (!param_index) {
+      param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+   }
+
+   addr =
+      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index);
+
+   /* TODO: This will generate rather ordinary llvm code, although it
+    * should be easy for the optimiser to fix up. In future we might want
+    * to refactor buffer_load().
+    */
+   LLVMValueRef value[4];
+   for (unsigned i = 0; i < num_components; i++) {
+      unsigned offset = i;
+      if (ac_get_type_size(type) == 8) {
+         offset *= 2;
+         if (offset == 4) {
+            ubyte name = info->input_semantic_name[driver_location + 1];
+            ubyte index = info->input_semantic_index[driver_location + 1];
+            addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index,
+                                                                   name, index);
+         }
+
+         offset = offset % 4;
+      }
+
+      offset += component;
+      value[i + component] =
+         buffer_load(ctx, type, offset, ctx->tess_offchip_ring, base, addr, true);
+   }
+
+   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 }
 
-static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
-                                   const struct nir_variable *var,
-                                   LLVMValueRef vertex_index,
-                                   LLVMValueRef param_index,
-                                   unsigned const_index,
-                                   LLVMValueRef src,
-                                   unsigned writemask)
+static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_variable *var,
+                                    LLVMValueRef vertex_index, LLVMValueRef param_index,
+                                    unsigned const_index, LLVMValueRef src, unsigned writemask)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       const unsigned component = var->data.location_frac;
-       unsigned driver_location = var->data.driver_location;
-       LLVMValueRef dw_addr, stride;
-       LLVMValueRef buffer, base, addr;
-       LLVMValueRef values[8];
-       bool skip_lds_store;
-       bool is_tess_factor = false, is_tess_inner = false;
-
-       driver_location = driver_location / 4;
-       ubyte name = info->output_semantic_name[driver_location];
-       ubyte index = info->output_semantic_index[driver_location];
-
-       bool is_const = !param_index;
-       if (!param_index)
-               param_index = LLVMConstInt(ctx->i32, const_index, 0);
-
-       const bool is_patch = var->data.patch ||
-                             var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
-                             var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
-
-       assert((name == TGSI_SEMANTIC_PATCH ||
-               name == TGSI_SEMANTIC_TESSINNER ||
-               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
-
-       if (!is_patch) {
-               stride = get_tcs_out_vertex_dw_stride(ctx);
-               dw_addr = get_tcs_out_current_patch_offset(ctx);
-               dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
-                                                             vertex_index, param_index,
-                                                             name, index);
-
-               skip_lds_store = !info->reads_pervertex_outputs;
-       } else {
-               dw_addr = get_tcs_out_current_patch_data_offset(ctx);
-               dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
-                                                             vertex_index, param_index,
-                                                             name, index);
-
-               skip_lds_store = !info->reads_perpatch_outputs;
-
-               if (is_const && const_index == 0) {
-                       int name = info->output_semantic_name[driver_location];
-
-                       /* Always write tess factors into LDS for the TCS epilog. */
-                       if (name == TGSI_SEMANTIC_TESSINNER ||
-                           name == TGSI_SEMANTIC_TESSOUTER) {
-                               /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
-                               skip_lds_store = !info->reads_tessfactor_outputs &&
-                                                ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
-                               is_tess_factor = true;
-                               is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
-                       }
-               }
-       }
-
-       buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-
-       base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
-       addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index, name, index);
-
-       for (unsigned chan = component; chan < 8; chan++) {
-               if (!(writemask & (1 << chan)))
-                       continue;
-               LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
-
-               unsigned buffer_store_offset = chan % 4;
-               if (chan == 4) {
-                       ubyte name = info->output_semantic_name[driver_location + 1];
-                       ubyte index = info->output_semantic_index[driver_location + 1];
-                        addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
-                                                                               vertex_index,
-                                                                               param_index,
-                                                                              name, index);
-               }
-
-               /* Skip LDS stores if there is no LDS read of this output. */
-               if (!skip_lds_store)
-                       lshs_lds_store(ctx, chan, dw_addr, value);
-
-               value = ac_to_integer(&ctx->ac, value);
-               values[chan] = value;
-
-               if (writemask != 0xF && !is_tess_factor) {
-                       ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
-                                                   addr, base,
-                                                   4 * buffer_store_offset,
-                                                    ac_glc);
-               }
-
-               /* Write tess factors into VGPRs for the epilog. */
-               if (is_tess_factor &&
-                   ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
-                       if (!is_tess_inner) {
-                               LLVMBuildStore(ctx->ac.builder, value, /* outer */
-                                              ctx->invoc0_tess_factors[chan]);
-                       } else if (chan < 2) {
-                               LLVMBuildStore(ctx->ac.builder, value, /* inner */
-                                              ctx->invoc0_tess_factors[4 + chan]);
-                       }
-               }
-       }
-
-       if (writemask == 0xF && !is_tess_factor) {
-               LLVMValueRef value = ac_build_gather_values(&ctx->ac,
-                                                           values, 4);
-               ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
-                                           base, 0, ac_glc);
-       }
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   unsigned component = var->data.location_frac;
+   unsigned driver_location = var->data.driver_location;
+   LLVMValueRef dw_addr, stride;
+   LLVMValueRef buffer, base, addr;
+   LLVMValueRef values[8];
+   bool skip_lds_store;
+   bool is_tess_factor = false, is_tess_inner = false;
+
+   if (var->data.compact) {
+      component += const_index;
+      writemask <<= const_index;
+      const_index = 0;
+   }
+
+   driver_location = driver_location / 4;
+   ubyte name = info->output_semantic_name[driver_location];
+   ubyte index = info->output_semantic_index[driver_location];
+
+   bool is_const = !param_index;
+   if (!param_index)
+      param_index = LLVMConstInt(ctx->ac.i32, const_index, 0);
+
+   const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+                         var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
+
+   /* Invalid SPIR-V can cause this. */
+   if ((name == TGSI_SEMANTIC_PATCH || name == TGSI_SEMANTIC_TESSINNER ||
+        name == TGSI_SEMANTIC_TESSOUTER) != is_patch)
+      return;
+
+   if (!is_patch) {
+      stride = get_tcs_out_vertex_dw_stride(ctx);
+      dw_addr = get_tcs_out_current_patch_offset(ctx);
+      dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
+                                                    name, index);
+
+      skip_lds_store = !info->reads_pervertex_outputs;
+   } else {
+      dw_addr = get_tcs_out_current_patch_data_offset(ctx);
+      dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
+                                                    name, index);
+
+      skip_lds_store = !info->reads_perpatch_outputs;
+
+      if (is_const && const_index == 0) {
+         int name = info->output_semantic_name[driver_location];
+
+         /* Always write tess factors into LDS for the TCS epilog. */
+         if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) {
+            /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
+            skip_lds_store = !info->reads_tessfactor_outputs &&
+                             ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
+            is_tess_factor = true;
+            is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
+         }
+      }
+   }
+
+   buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+
+   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+   addr =
+      get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, name, index);
+
+   for (unsigned chan = component; chan < 8; chan++) {
+      if (!(writemask & (1 << chan)))
+         continue;
+      LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
+
+      unsigned buffer_store_offset = chan % 4;
+      if (chan == 4) {
+         ubyte name = info->output_semantic_name[driver_location + 1];
+         ubyte index = info->output_semantic_index[driver_location + 1];
+         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index,
+                                                                name, index);
+      }
+
+      /* Skip LDS stores if there is no LDS read of this output. */
+      if (!skip_lds_store)
+         lshs_lds_store(ctx, chan, dw_addr, value);
+
+      value = ac_to_integer(&ctx->ac, value);
+      values[chan] = value;
+
+      if (writemask != 0xF && !is_tess_factor) {
+         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base,
+                                     4 * buffer_store_offset, ac_glc);
+      }
+
+      /* Write tess factors into VGPRs for the epilog. */
+      if (is_tess_factor && ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
+         if (!is_tess_inner) {
+            LLVMBuildStore(ctx->ac.builder, value, /* outer */
+                           ctx->invoc0_tess_factors[chan]);
+         } else if (chan < 2) {
+            LLVMBuildStore(ctx->ac.builder, value, /* inner */
+                           ctx->invoc0_tess_factors[4 + chan]);
+         }
+      }
+   }
+
+   if (writemask == 0xF && !is_tess_factor) {
+      LLVMValueRef value = ac_build_gather_values(&ctx->ac, values, 4);
+      ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr, base, 0, ac_glc);
+   }
 }
 
 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMValueRef coord[4] = {
-               ac_get_arg(&ctx->ac, ctx->tes_u),
-               ac_get_arg(&ctx->ac, ctx->tes_v),
-               ctx->ac.f32_0,
-               ctx->ac.f32_0
-       };
-
-       /* For triangles, the vector should be (u, v, 1-u-v). */
-       if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
-           PIPE_PRIM_TRIANGLES) {
-               coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
-                                        LLVMBuildFAdd(ctx->ac.builder,
-                                                      coord[0], coord[1], ""), "");
-       }
-       return ac_build_gather_values(&ctx->ac, coord, 4);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->tes_u), ac_get_arg(&ctx->ac, ctx->tes_v),
+                            ctx->ac.f32_0, ctx->ac.f32_0};
+
+   /* For triangles, the vector should be (u, v, 1-u-v). */
+   if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_TRIANGLES) {
+      coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
+                               LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), "");
+   }
+   return ac_build_gather_values(&ctx->ac, coord, 4);
 }
 
-static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
-                                   unsigned semantic_name)
+static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic_name)
 {
-       LLVMValueRef base, addr;
-
-       int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
+   LLVMValueRef base, addr;
 
-       base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-       addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
-                                         LLVMConstInt(ctx->i32, param, 0));
+   int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
 
-       return buffer_load(ctx, ctx->f32,
-                          ~0, ctx->tess_offchip_ring, base, addr, true);
+   base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+   addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
+                                     LLVMConstInt(ctx->ac.i32, param, 0));
 
+   return buffer_load(ctx, ctx->ac.f32, ~0, ctx->tess_offchip_ring, base, addr, true);
 }
 
-static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx,
-                                           unsigned semantic_name)
+static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx, unsigned semantic_name)
 {
-       LLVMValueRef buf, slot, val[4];
-       int i, offset;
-
-       slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
-       buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
-       offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
-
-       for (i = 0; i < 4; i++)
-               val[i] = si_buffer_load_const(ctx, buf,
-                                             LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
-       return ac_build_gather_values(&ctx->ac, val, 4);
+   LLVMValueRef buf, slot, val[4];
+   int i, offset;
+
+   slot = LLVMConstInt(ctx->ac.i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
+   buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
+   offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
+
+   for (i = 0; i < 4; i++)
+      val[i] = si_buffer_load_const(ctx, buf, LLVMConstInt(ctx->ac.i32, (offset + i) * 4, 0));
+   return ac_build_gather_values(&ctx->ac, val, 4);
 }
 
-static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
-                                      unsigned varying_id,
-                                      bool load_default_state)
+static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi, unsigned varying_id,
+                                       bool load_default_state)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       unsigned semantic_name;
-
-       if (load_default_state) {
-               switch (varying_id) {
-               case VARYING_SLOT_TESS_LEVEL_INNER:
-                       semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
-                       break;
-               case VARYING_SLOT_TESS_LEVEL_OUTER:
-                       semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
-                       break;
-               default:
-                       unreachable("unknown tess level");
-               }
-               return load_tess_level_default(ctx, semantic_name);
-       }
-
-       switch (varying_id) {
-       case VARYING_SLOT_TESS_LEVEL_INNER:
-               semantic_name = TGSI_SEMANTIC_TESSINNER;
-               break;
-       case VARYING_SLOT_TESS_LEVEL_OUTER:
-               semantic_name = TGSI_SEMANTIC_TESSOUTER;
-               break;
-       default:
-               unreachable("unknown tess level");
-       }
-
-       return load_tess_level(ctx, semantic_name);
-
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   unsigned semantic_name;
+
+   if (load_default_state) {
+      switch (varying_id) {
+      case VARYING_SLOT_TESS_LEVEL_INNER:
+         semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
+         break;
+      case VARYING_SLOT_TESS_LEVEL_OUTER:
+         semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
+         break;
+      default:
+         unreachable("unknown tess level");
+      }
+      return load_tess_level_default(ctx, semantic_name);
+   }
+
+   switch (varying_id) {
+   case VARYING_SLOT_TESS_LEVEL_INNER:
+      semantic_name = TGSI_SEMANTIC_TESSINNER;
+      break;
+   case VARYING_SLOT_TESS_LEVEL_OUTER:
+      semantic_name = TGSI_SEMANTIC_TESSOUTER;
+      break;
+   default:
+      unreachable("unknown tess level");
+   }
+
+   return load_tess_level(ctx, semantic_name);
 }
 
 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       if (ctx->type == PIPE_SHADER_TESS_CTRL)
-               return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
-       else if (ctx->type == PIPE_SHADER_TESS_EVAL)
-               return get_num_tcs_out_vertices(ctx);
-       else
-               unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   if (ctx->type == PIPE_SHADER_TESS_CTRL)
+      return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
+   else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+      return get_num_tcs_out_vertices(ctx);
+   else
+      unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
 }
 
 /**
@@ -791,494 +714,450 @@ static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
  */
 static void si_copy_tcs_inputs(struct si_shader_context *ctx)
 {
-       LLVMValueRef invocation_id, buffer, buffer_offset;
-       LLVMValueRef lds_vertex_stride, lds_base;
-       uint64_t inputs;
+   LLVMValueRef invocation_id, buffer, buffer_offset;
+   LLVMValueRef lds_vertex_stride, lds_base;
+   uint64_t inputs;
 
-       invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
-       buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-       buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+   invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
+   buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+   buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
 
-       lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
-       lds_base = get_tcs_in_current_patch_offset(ctx);
-       lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
-                                lds_base);
+   lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
+   lds_base = get_tcs_in_current_patch_offset(ctx);
+   lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base);
 
-       inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
-       while (inputs) {
-               unsigned i = u_bit_scan64(&inputs);
+   inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
+   while (inputs) {
+      unsigned i = u_bit_scan64(&inputs);
 
-               LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
-                                           LLVMConstInt(ctx->i32, 4 * i, 0),
-                                            "");
+      LLVMValueRef lds_ptr =
+         LLVMBuildAdd(ctx->ac.builder, lds_base, LLVMConstInt(ctx->ac.i32, 4 * i, 0), "");
 
-               LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
-                                             get_rel_patch_id(ctx),
-                                             invocation_id,
-                                             LLVMConstInt(ctx->i32, i, 0));
+      LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(
+         ctx, get_rel_patch_id(ctx), invocation_id, LLVMConstInt(ctx->ac.i32, i, 0));
 
-               LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
+      LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
 
-               ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
-                                           buffer_offset, 0, ac_glc);
-       }
+      ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0,
+                                  ac_glc);
+   }
 }
 
-static void si_write_tess_factors(struct si_shader_context *ctx,
-                                 LLVMValueRef rel_patch_id,
-                                 LLVMValueRef invocation_id,
-                                 LLVMValueRef tcs_out_current_patch_data_offset,
-                                 LLVMValueRef invoc0_tf_outer[4],
-                                 LLVMValueRef invoc0_tf_inner[2])
+static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef rel_patch_id,
+                                  LLVMValueRef invocation_id,
+                                  LLVMValueRef tcs_out_current_patch_data_offset,
+                                  LLVMValueRef invoc0_tf_outer[4], LLVMValueRef invoc0_tf_inner[2])
 {
-       struct si_shader *shader = ctx->shader;
-       unsigned tess_inner_index, tess_outer_index;
-       LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
-       LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
-       unsigned stride, outer_comps, inner_comps, i, offset;
-
-       /* Add a barrier before loading tess factors from LDS. */
-       if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
-               si_llvm_emit_barrier(ctx);
-
-       /* Do this only for invocation 0, because the tess levels are per-patch,
-        * not per-vertex.
-        *
-        * This can't jump, because invocation 0 executes this. It should
-        * at least mask out the loads and stores for other invocations.
-        */
-       ac_build_ifcc(&ctx->ac,
-                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
-                                   invocation_id, ctx->i32_0, ""), 6503);
-
-       /* Determine the layout of one tess factor element in the buffer. */
-       switch (shader->key.part.tcs.epilog.prim_mode) {
-       case PIPE_PRIM_LINES:
-               stride = 2; /* 2 dwords, 1 vec2 store */
-               outer_comps = 2;
-               inner_comps = 0;
-               break;
-       case PIPE_PRIM_TRIANGLES:
-               stride = 4; /* 4 dwords, 1 vec4 store */
-               outer_comps = 3;
-               inner_comps = 1;
-               break;
-       case PIPE_PRIM_QUADS:
-               stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
-               outer_comps = 4;
-               inner_comps = 2;
-               break;
-       default:
-               assert(0);
-               return;
-       }
-
-       for (i = 0; i < 4; i++) {
-               inner[i] = LLVMGetUndef(ctx->i32);
-               outer[i] = LLVMGetUndef(ctx->i32);
-       }
-
-       if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
-               /* Tess factors are in VGPRs. */
-               for (i = 0; i < outer_comps; i++)
-                       outer[i] = out[i] = invoc0_tf_outer[i];
-               for (i = 0; i < inner_comps; i++)
-                       inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
-       } else {
-               /* Load tess_inner and tess_outer from LDS.
-                * Any invocation can write them, so we can't get them from a temporary.
-                */
-               tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
-               tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
-
-               lds_base = tcs_out_current_patch_data_offset;
-               lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
-                                        LLVMConstInt(ctx->i32,
-                                                     tess_inner_index * 4, 0), "");
-               lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
-                                        LLVMConstInt(ctx->i32,
-                                                     tess_outer_index * 4, 0), "");
-
-               for (i = 0; i < outer_comps; i++) {
-                       outer[i] = out[i] =
-                               lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
-               }
-               for (i = 0; i < inner_comps; i++) {
-                       inner[i] = out[outer_comps+i] =
-                               lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
-               }
-       }
-
-       if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
-               /* For isolines, the hardware expects tess factors in the
-                * reverse order from what NIR specifies.
-                */
-               LLVMValueRef tmp = out[0];
-               out[0] = out[1];
-               out[1] = tmp;
-       }
-
-       /* Convert the outputs to vectors for stores. */
-       vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
-       vec1 = NULL;
-
-       if (stride > 4)
-               vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
-
-       /* Get the buffer. */
-       buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
-
-       /* Get the offset. */
-       tf_base = ac_get_arg(&ctx->ac,
-                            ctx->tcs_factor_offset);
-       byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
-                                 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
-
-       ac_build_ifcc(&ctx->ac,
-                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
-                                   rel_patch_id, ctx->i32_0, ""), 6504);
-
-       /* Store the dynamic HS control word. */
-       offset = 0;
-       if (ctx->screen->info.chip_class <= GFX8) {
-               ac_build_buffer_store_dword(&ctx->ac, buffer,
-                                           LLVMConstInt(ctx->i32, 0x80000000, 0),
-                                           1, ctx->i32_0, tf_base,
-                                           offset, ac_glc);
-               offset += 4;
-       }
-
-       ac_build_endif(&ctx->ac, 6504);
-
-       /* Store the tessellation factors. */
-       ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
-                                   MIN2(stride, 4), byteoffset, tf_base,
-                                   offset, ac_glc);
-       offset += 16;
-       if (vec1)
-               ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
-                                           stride - 4, byteoffset, tf_base,
-                                           offset, ac_glc);
-
-       /* Store the tess factors into the offchip buffer if TES reads them. */
-       if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
-               LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
-               LLVMValueRef tf_inner_offset;
-               unsigned param_outer, param_inner;
-
-               buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
-               base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
-
-               param_outer = si_shader_io_get_unique_index_patch(
-                                     TGSI_SEMANTIC_TESSOUTER, 0);
-               tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
-                                       LLVMConstInt(ctx->i32, param_outer, 0));
-
-               unsigned outer_vec_size =
-                       ac_has_vec3_support(ctx->screen->info.chip_class, false) ?
-                               outer_comps : util_next_power_of_two(outer_comps);
-               outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
-
-               ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
-                                           outer_comps, tf_outer_offset,
-                                           base, 0, ac_glc);
-               if (inner_comps) {
-                       param_inner = si_shader_io_get_unique_index_patch(
-                                             TGSI_SEMANTIC_TESSINNER, 0);
-                       tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
-                                       LLVMConstInt(ctx->i32, param_inner, 0));
-
-                       inner_vec = inner_comps == 1 ? inner[0] :
-                                   ac_build_gather_values(&ctx->ac, inner, inner_comps);
-                       ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
-                                                   inner_comps, tf_inner_offset,
-                                                   base, 0, ac_glc);
-               }
-       }
-
-       ac_build_endif(&ctx->ac, 6503);
+   struct si_shader *shader = ctx->shader;
+   unsigned tess_inner_index, tess_outer_index;
+   LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
+   LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
+   unsigned stride, outer_comps, inner_comps, i, offset;
+
+   /* Add a barrier before loading tess factors from LDS. */
+   if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
+      si_llvm_emit_barrier(ctx);
+
+   /* Do this only for invocation 0, because the tess levels are per-patch,
+    * not per-vertex.
+    *
+    * This can't jump, because invocation 0 executes this. It should
+    * at least mask out the loads and stores for other invocations.
+    */
+   ac_build_ifcc(&ctx->ac,
+                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503);
+
+   /* Determine the layout of one tess factor element in the buffer. */
+   switch (shader->key.part.tcs.epilog.prim_mode) {
+   case PIPE_PRIM_LINES:
+      stride = 2; /* 2 dwords, 1 vec2 store */
+      outer_comps = 2;
+      inner_comps = 0;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      stride = 4; /* 4 dwords, 1 vec4 store */
+      outer_comps = 3;
+      inner_comps = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
+      outer_comps = 4;
+      inner_comps = 2;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   for (i = 0; i < 4; i++) {
+      inner[i] = LLVMGetUndef(ctx->ac.i32);
+      outer[i] = LLVMGetUndef(ctx->ac.i32);
+   }
+
+   if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
+      /* Tess factors are in VGPRs. */
+      for (i = 0; i < outer_comps; i++)
+         outer[i] = out[i] = invoc0_tf_outer[i];
+      for (i = 0; i < inner_comps; i++)
+         inner[i] = out[outer_comps + i] = invoc0_tf_inner[i];
+   } else {
+      /* Load tess_inner and tess_outer from LDS.
+       * Any invocation can write them, so we can't get them from a temporary.
+       */
+      tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+      tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+
+      lds_base = tcs_out_current_patch_data_offset;
+      lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
+                               LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, 0), "");
+      lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
+                               LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, 0), "");
+
+      for (i = 0; i < outer_comps; i++) {
+         outer[i] = out[i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
+      }
+      for (i = 0; i < inner_comps; i++) {
+         inner[i] = out[outer_comps + i] = lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
+      }
+   }
+
+   if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
+      /* For isolines, the hardware expects tess factors in the
+       * reverse order from what NIR specifies.
+       */
+      LLVMValueRef tmp = out[0];
+      out[0] = out[1];
+      out[1] = tmp;
+   }
+
+   /* Convert the outputs to vectors for stores. */
+   vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
+   vec1 = NULL;
+
+   if (stride > 4)
+      vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);
+
+   /* Get the buffer. */
+   buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
+
+   /* Get the offset. */
+   tf_base = ac_get_arg(&ctx->ac, ctx->tcs_factor_offset);
+   byteoffset =
+      LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, 0), "");
+
+   ac_build_ifcc(&ctx->ac,
+                 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
+
+   /* Store the dynamic HS control word. */
+   offset = 0;
+   if (ctx->screen->info.chip_class <= GFX8) {
+      ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0), 1,
+                                  ctx->ac.i32_0, tf_base, offset, ac_glc);
+      offset += 4;
+   }
+
+   ac_build_endif(&ctx->ac, 6504);
+
+   /* Store the tessellation factors. */
+   ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, offset,
+                               ac_glc);
+   offset += 16;
+   if (vec1)
+      ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, offset,
+                                  ac_glc);
+
+   /* Store the tess factors into the offchip buffer if TES reads them. */
+   if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
+      LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
+      LLVMValueRef tf_inner_offset;
+      unsigned param_outer, param_inner;
+
+      buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
+      base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
+
+      param_outer = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+      tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+                                                   LLVMConstInt(ctx->ac.i32, param_outer, 0));
+
+      unsigned outer_vec_size = ac_has_vec3_support(ctx->screen->info.chip_class, false)
+                                   ? outer_comps
+                                   : util_next_power_of_two(outer_comps);
+      outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
+
+      ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, outer_comps, tf_outer_offset, base, 0,
+                                  ac_glc);
+      if (inner_comps) {
+         param_inner = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
+                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));
+
+         inner_vec =
+            inner_comps == 1 ? inner[0] : ac_build_gather_values(&ctx->ac, inner, inner_comps);
+         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, inner_comps, tf_inner_offset, base,
+                                     0, ac_glc);
+      }
+   }
+
+   ac_build_endif(&ctx->ac, 6503);
 }
 
 /* This only writes the tessellation factor levels. */
-static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
-                                     unsigned max_outputs,
-                                     LLVMValueRef *addrs)
+static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                      LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
-
-       si_copy_tcs_inputs(ctx);
-
-       rel_patch_id = get_rel_patch_id(ctx);
-       invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
-       tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               LLVMBasicBlockRef blocks[2] = {
-                       LLVMGetInsertBlock(builder),
-                       ctx->merged_wrap_if_entry_block
-               };
-               LLVMValueRef values[2];
-
-               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-
-               values[0] = rel_patch_id;
-               values[1] = LLVMGetUndef(ctx->i32);
-               rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
-
-               values[0] = tf_lds_offset;
-               values[1] = LLVMGetUndef(ctx->i32);
-               tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
-
-               values[0] = invocation_id;
-               values[1] = ctx->i32_1; /* cause the epilog to skip threads */
-               invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
-       }
-
-       /* Return epilog parameters from this function. */
-       LLVMValueRef ret = ctx->return_value;
-       unsigned vgpr;
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
-                                         8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
-                                         8 + GFX9_SGPR_TCS_OUT_LAYOUT);
-               /* Tess offchip and tess factor offsets are at the beginning. */
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
-               vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
-       } else {
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
-                                         GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
-                                         GFX6_SGPR_TCS_OUT_LAYOUT);
-               /* Tess offchip and tess factor offsets are after user SGPRs. */
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset,
-                                         GFX6_TCS_NUM_USER_SGPR);
-               ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset,
-                                         GFX6_TCS_NUM_USER_SGPR + 1);
-               vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
-       }
-
-       /* VGPRs */
-       rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
-       invocation_id = ac_to_float(&ctx->ac, invocation_id);
-       tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
-
-       /* Leave a hole corresponding to the two input VGPRs. This ensures that
-        * the invocation_id output does not alias the tcs_rel_ids input,
-        * which saves a V_MOV on gfx9.
-        */
-       vgpr += 2;
-
-       ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
-       ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
-
-       if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
-               vgpr++; /* skip the tess factor LDS offset */
-               for (unsigned i = 0; i < 6; i++) {
-                       LLVMValueRef value =
-                               LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
-                       value = ac_to_float(&ctx->ac, value);
-                       ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
-               }
-       } else {
-               ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
-       }
-       ctx->return_value = ret;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+
+   si_copy_tcs_inputs(ctx);
+
+   rel_patch_id = get_rel_patch_id(ctx);
+   invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
+   tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      LLVMBasicBlockRef blocks[2] = {LLVMGetInsertBlock(builder), ctx->merged_wrap_if_entry_block};
+      LLVMValueRef values[2];
+
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
+      values[0] = rel_patch_id;
+      values[1] = LLVMGetUndef(ctx->ac.i32);
+      rel_patch_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+
+      values[0] = tf_lds_offset;
+      values[1] = LLVMGetUndef(ctx->ac.i32);
+      tf_lds_offset = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+
+      values[0] = invocation_id;
+      values[1] = ctx->ac.i32_1; /* cause the epilog to skip threads */
+      invocation_id = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, values, blocks);
+   }
+
+   /* Return epilog parameters from this function. */
+   LLVMValueRef ret = ctx->return_value;
+   unsigned vgpr;
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      ret =
+         si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+      /* Tess offchip and tess factor offsets are at the beginning. */
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
+      vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
+   } else {
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, GFX6_SGPR_TCS_OUT_LAYOUT);
+      /* Tess offchip and tess factor offsets are after user SGPRs. */
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, GFX6_TCS_NUM_USER_SGPR);
+      ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, GFX6_TCS_NUM_USER_SGPR + 1);
+      vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
+   }
+
+   /* VGPRs */
+   rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
+   invocation_id = ac_to_float(&ctx->ac, invocation_id);
+   tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
+
+   /* Leave a hole corresponding to the two input VGPRs. This ensures that
+    * the invocation_id output does not alias the tcs_rel_ids input,
+    * which saves a V_MOV on gfx9.
+    */
+   vgpr += 2;
+
+   ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
+   ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
+
+   if (ctx->shader->selector->info.tessfactors_are_def_in_all_invocs) {
+      vgpr++; /* skip the tess factor LDS offset */
+      for (unsigned i = 0; i < 6; i++) {
+         LLVMValueRef value = LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
+         value = ac_to_float(&ctx->ac, value);
+         ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
+      }
+   } else {
+      ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+   }
+   ctx->return_value = ret;
 }
 
 /* Pass TCS inputs from LS to TCS on GFX9. */
 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
 {
-       LLVMValueRef ret = ctx->return_value;
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
-                                 8 + SI_SGPR_RW_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->bindless_samplers_and_images,
-                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-
-       ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits,
-                                 8 + SI_SGPR_VS_STATE_BITS);
-
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
-                                 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets,
-                                 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
-       ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
-                                 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
-
-       unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
-       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                  ac_to_float(&ctx->ac,
-                                              ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
-                                  vgpr++, "");
-       ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
-                                  ac_to_float(&ctx->ac,
-                                              ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
-                                  vgpr++, "");
-       ctx->return_value = ret;
+   LLVMValueRef ret = ctx->return_value;
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+
+   ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
+   ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout, 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+
+   unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
+   ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+                              ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
+                              vgpr++, "");
+   ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
+                              ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
+                              vgpr++, "");
+   ctx->return_value = ret;
 }
 
-void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs)
+void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       struct si_shader_info *info = &shader->selector->info;
-       unsigned i, chan;
-       LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
-       LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
-       LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
-                                                vertex_dw_stride, "");
-
-       /* Write outputs to LDS. The next shader (TCS aka HS) will read
-        * its inputs from it. */
-       for (i = 0; i < info->num_outputs; i++) {
-               unsigned name = info->output_semantic_name[i];
-               unsigned index = info->output_semantic_index[i];
-
-               /* The ARB_shader_viewport_layer_array spec contains the
-                * following issue:
-                *
-                *    2) What happens if gl_ViewportIndex or gl_Layer is
-                *    written in the vertex shader and a geometry shader is
-                *    present?
-                *
-                *    RESOLVED: The value written by the last vertex processing
-                *    stage is used. If the last vertex processing stage
-                *    (vertex, tessellation evaluation or geometry) does not
-                *    statically assign to gl_ViewportIndex or gl_Layer, index
-                *    or layer zero is assumed.
-                *
-                * So writes to those outputs in VS-as-LS are simply ignored.
-                */
-               if (name == TGSI_SEMANTIC_LAYER ||
-                   name == TGSI_SEMANTIC_VIEWPORT_INDEX)
-                       continue;
-
-               int param = si_shader_io_get_unique_index(name, index, false);
-               LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
-                                       LLVMConstInt(ctx->i32, param * 4, 0), "");
-
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)))
-                               continue;
-
-                       lshs_lds_store(ctx, chan, dw_addr,
-                                 LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
-               }
-       }
-
-       if (ctx->screen->info.chip_class >= GFX9)
-               si_set_ls_return_value_for_tcs(ctx);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *shader = ctx->shader;
+   struct si_shader_info *info = &shader->selector->info;
+   unsigned i, chan;
+   LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
+   LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
+   LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, "");
+
+   /* Write outputs to LDS. The next shader (TCS aka HS) will read
+    * its inputs from it. */
+   for (i = 0; i < info->num_outputs; i++) {
+      unsigned name = info->output_semantic_name[i];
+      unsigned index = info->output_semantic_index[i];
+
+      /* The ARB_shader_viewport_layer_array spec contains the
+       * following issue:
+       *
+       *    2) What happens if gl_ViewportIndex or gl_Layer is
+       *    written in the vertex shader and a geometry shader is
+       *    present?
+       *
+       *    RESOLVED: The value written by the last vertex processing
+       *    stage is used. If the last vertex processing stage
+       *    (vertex, tessellation evaluation or geometry) does not
+       *    statically assign to gl_ViewportIndex or gl_Layer, index
+       *    or layer zero is assumed.
+       *
+       * So writes to those outputs in VS-as-LS are simply ignored.
+       */
+      if (name == TGSI_SEMANTIC_LAYER || name == TGSI_SEMANTIC_VIEWPORT_INDEX)
+         continue;
+
+      int param = si_shader_io_get_unique_index(name, index, false);
+      LLVMValueRef dw_addr =
+         LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, 0), "");
+
+      for (chan = 0; chan < 4; chan++) {
+         if (!(info->output_usagemask[i] & (1 << chan)))
+            continue;
+
+         lshs_lds_store(ctx, chan, dw_addr,
+                        LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
+      }
+   }
+
+   if (ctx->screen->info.chip_class >= GFX9)
+      si_set_ls_return_value_for_tcs(ctx);
 }
 
 /**
  * Compile the TCS epilog function. This writes tessellation factors to memory
  * based on the output primitive type of the tessellator (determined by TES).
  */
-void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
-                             union si_shader_part_key *key)
+void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
 {
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_factor_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_out_lds_layout);
-       } else {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_out_lds_layout);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_offchip_offset);
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
-                          &ctx->tcs_factor_offset);
-       }
-
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
-       struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
-       struct ac_arg invocation_id; /* invocation ID within the patch */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
-       struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
-       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
-                  &tcs_out_current_patch_data_offset);
-
-       struct ac_arg tess_factors[6];
-       for (unsigned i = 0; i < 6; i++)
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "tcs_epilog", NULL, 0,
-                           ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
-       ac_declare_lds_as_pointer(&ctx->ac);
-
-       LLVMValueRef invoc0_tess_factors[6];
-       for (unsigned i = 0; i < 6; i++)
-               invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
-
-       si_write_tess_factors(ctx,
-                             ac_get_arg(&ctx->ac, rel_patch_id),
-                             ac_get_arg(&ctx->ac, invocation_id),
-                             ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
-                             invoc0_tess_factors, invoc0_tess_factors + 4);
-
-       LLVMBuildRetVoid(ctx->ac.builder);
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+   } else {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
+   }
+
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
+   struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
+   struct ac_arg invocation_id; /* invocation ID within the patch */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
+   struct ac_arg
+      tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
+   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tcs_out_current_patch_data_offset);
+
+   struct ac_arg tess_factors[6];
+   for (unsigned i = 0; i < 6; i++)
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "tcs_epilog", NULL, 0, ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
+   ac_declare_lds_as_pointer(&ctx->ac);
+
+   LLVMValueRef invoc0_tess_factors[6];
+   for (unsigned i = 0; i < 6; i++)
+      invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
+
+   si_write_tess_factors(ctx, ac_get_arg(&ctx->ac, rel_patch_id),
+                         ac_get_arg(&ctx->ac, invocation_id),
+                         ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
+                         invoc0_tess_factors, invoc0_tess_factors + 4);
+
+   LLVMBuildRetVoid(ctx->ac.builder);
 }
 
 void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx)
 {
-       ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
-       ctx->abi.load_tess_level = si_load_tess_level;
-       ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
-       ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
-       ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+   ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings; /* Fill ctx->abi with the TCS hooks. */
+   ctx->abi.load_tess_level = si_load_tess_level; /* same handler the TES init installs */
+   ctx->abi.store_tcs_outputs = si_nir_store_output_tcs; /* only set here, not by the TES init */
+   ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue; /* unconditional, unlike the TES init */
+   ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; /* same handler as the TES init */
 }
 
-void si_llvm_init_tes_callbacks(struct si_shader_context *ctx)
+void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
 {
-       ctx->abi.load_tess_varyings = si_nir_load_input_tes;
-       ctx->abi.load_tess_coord = si_load_tess_coord;
-       ctx->abi.load_tess_level = si_load_tess_level;
-       ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+   ctx->abi.load_tess_varyings = si_nir_load_input_tes; /* Fill ctx->abi with the TES load hooks. */
+   ctx->abi.load_tess_coord = si_load_tess_coord;
+   ctx->abi.load_tess_level = si_load_tess_level;
+   ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+
+   if (ctx->shader->key.as_es) /* Select emit_outputs; the first matching case wins. */
+      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+   else if (ngg_cull_shader) /* caller-supplied flag, tested before key.as_ngg */
+      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
+   else if (ctx->shader->key.as_ngg)
+      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+   else
+      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; /* fallback when none of the above hold */
 }