radeonsi: remove redundant GS variables in si_shader_selector

[mesa.git] / src / gallium / drivers / radeonsi / si_shader_llvm_gs.c
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c

index 69dc7080a6c622bc773977ebcc50bcb809c0095f..3bbb18019f3c5c3d4a47cf63dc923853528509b4 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -22,759 +22,690 @@
   * USE OR OTHER DEALINGS IN THE SOFTWARE.
   */
  
-#include "si_shader_internal.h"
  #include "si_pipe.h"
+#include "si_shader_internal.h"
  #include "sid.h"
  #include "util/u_memory.h"
  
  LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
  {
-       /* Return true if the current thread should execute an ES thread. */
-       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                            ac_get_thread_id(&ctx->ac),
-                            si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
+   /* Return true if the current thread should execute an ES thread. */
+   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
+                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
  }
  
  LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
  {
-       /* Return true if the current thread should execute a GS thread. */
-       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                            ac_get_thread_id(&ctx->ac),
-                            si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
+   /* Return true if the current thread should execute a GS thread. */
+   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
+                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
  }
  
-static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
-                                         unsigned input_index,
-                                         unsigned vtx_offset_param,
-                                         LLVMTypeRef type,
-                                         unsigned swizzle)
+static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
+                                          unsigned vtx_offset_param, LLVMTypeRef type,
+                                          unsigned swizzle)
  {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       LLVMValueRef vtx_offset, soffset;
-       struct si_shader_info *info = &shader->selector->info;
-       unsigned semantic_name = info->input_semantic_name[input_index];
-       unsigned semantic_index = info->input_semantic_index[input_index];
-       unsigned param;
-       LLVMValueRef value;
-
-       param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
-
-       /* GFX9 has the ESGS ring in LDS. */
-       if (ctx->screen->info.chip_class >= GFX9) {
-               unsigned index = vtx_offset_param;
-
-               switch (index / 2) {
-               case 0:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               case 1:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               case 2:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               default:
-                       assert(0);
-                       return NULL;
-               }
-
-               unsigned offset = param * 4 + swizzle;
-               vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
-                                         LLVMConstInt(ctx->i32, offset, false), "");
-
-               LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
-               LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
-               if (ac_get_type_size(type) == 64) {
-                       ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
-                                          &ctx->ac.i32_1, 1, "");
-                       LLVMValueRef values[2] = {
-                               value,
-                               LLVMBuildLoad(ctx->ac.builder, ptr, "")
-                       };
-                       value = ac_build_gather_values(&ctx->ac, values, 2);
-               }
-               return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
-       }
-
-       /* GFX6: input load from the ESGS ring in memory. */
-       if (swizzle == ~0) {
-               LLVMValueRef values[4];
-               unsigned chan;
-               for (chan = 0; chan < 4; chan++) {
-                       values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
-                                                            type, chan);
-               }
-               return ac_build_gather_values(&ctx->ac, values, 4);
-       }
-
-       /* Get the vertex offset parameter on GFX6. */
-       LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
-                                               ctx->gs_vtx_offset[vtx_offset_param]);
-
-       vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
-                                 LLVMConstInt(ctx->i32, 4, 0), "");
-
-       soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
-
-       value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
-                                    vtx_offset, soffset, 0, ac_glc, true, false);
-       if (ac_get_type_size(type) == 64) {
-               LLVMValueRef value2;
-               soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
-
-               value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
-                                             ctx->i32_0, vtx_offset, soffset,
-                                             0, ac_glc, true, false);
-               return si_build_gather_64bit(ctx, type, value, value2);
-       }
-       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *shader = ctx->shader;
+   LLVMValueRef vtx_offset, soffset;
+   struct si_shader_info *info = &shader->selector->info;
+   unsigned param;
+   LLVMValueRef value;
+
+   param = si_shader_io_get_unique_index(info->input_semantic[input_index], false);
+
+   /* GFX9 has the ESGS ring in LDS. */
+   if (ctx->screen->info.chip_class >= GFX9) {
+      unsigned index = vtx_offset_param;
+
+      switch (index / 2) {
+      case 0:
+         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
+         break;
+      case 1:
+         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
+         break;
+      case 2:
+         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
+         break;
+      default:
+         assert(0);
+         return NULL;
+      }
+
+      unsigned offset = param * 4 + swizzle;
+      vtx_offset =
+         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");
+
+      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
+      LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+      if (ac_get_type_size(type) == 8) {
+         ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, "");
+         LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")};
+         value = ac_build_gather_values(&ctx->ac, values, 2);
+      }
+      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+   }
+
+   /* GFX6: input load from the ESGS ring in memory. */
+   if (swizzle == ~0) {
+      LLVMValueRef values[4];
+      unsigned chan;
+      for (chan = 0; chan < 4; chan++) {
+         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan);
+      }
+      return ac_build_gather_values(&ctx->ac, values, 4);
+   }
+
+   /* Get the vertex offset parameter on GFX6. */
+   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);
+
+   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);
+
+   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
+                                ac_glc, true, false);
+   if (ac_get_type_size(type) == 8) {
+      LLVMValueRef value2;
+      soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);
+
+      value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset,
+                                    0, ac_glc, true, false);
+      return si_build_gather_64bit(ctx, type, value, value2);
+   }
+   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
  }
  
-static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
-                                        unsigned location,
-                                        unsigned driver_location,
-                                        unsigned component,
-                                        unsigned num_components,
-                                        unsigned vertex_index,
-                                        unsigned const_index,
-                                        LLVMTypeRef type)
+static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location,
+                                         unsigned driver_location, unsigned component,
+                                         unsigned num_components, unsigned vertex_index,
+                                         unsigned const_index, LLVMTypeRef type)
  {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
  
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 64)
-                       offset *= 2;
+   LLVMValueRef value[4];
+   for (unsigned i = 0; i < num_components; i++) {
+      unsigned offset = i;
+      if (ac_get_type_size(type) == 8)
+         offset *= 2;
  
-               offset += component;
-               value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4 + const_index,
-                                                            vertex_index, type, offset);
-       }
+      offset += component;
+      value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
+                                                   vertex_index, type, offset);
+   }
  
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
  }
  
  /* Pass GS inputs from ES to GS on GFX9. */
  static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
  {
-       LLVMValueRef ret = ctx->return_value;
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
-       if (ctx->shader->key.as_ngg)
-               ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
-       else
-               ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
-                                 8 + SI_SGPR_RW_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->bindless_samplers_and_images,
-                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-       if (ctx->screen->use_ngg) {
-               ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
-                                         8 + SI_SGPR_VS_STATE_BITS);
-       }
-
-       unsigned vgpr;
-       if (ctx->type == PIPE_SHADER_VERTEX)
-               vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
-       else
-               vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
-
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
-       ctx->return_value = ret;
+   LLVMValueRef ret = ctx->return_value;
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+   if (ctx->shader->key.as_ngg)
+      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
+   else
+      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
+   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
+                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+   if (ctx->screen->use_ngg) {
+      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
+   }
+
+   unsigned vgpr;
+   if (ctx->stage == MESA_SHADER_VERTEX)
+      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
+   else
+      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+
+   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
+   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
+   ctx->return_value = ret;
  }
  
-void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
-                             LLVMValueRef *addrs)
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
  {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *es = ctx->shader;
-       struct si_shader_info *info = &es->selector->info;
-       LLVMValueRef lds_base = NULL;
-       unsigned chan;
-       int i;
-
-       if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
-               unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
-               LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
-               LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
-               vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
-                                        LLVMBuildMul(ctx->ac.builder, wave_idx,
-                                                     LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), "");
-               lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
-                                       LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
-       }
-
-       for (i = 0; i < info->num_outputs; i++) {
-               int param;
-
-               if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
-                   info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
-                       continue;
-
-               param = si_shader_io_get_unique_index(info->output_semantic_name[i],
-                                                     info->output_semantic_index[i], false);
-
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)))
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-
-                       /* GFX9 has the ESGS ring in LDS. */
-                       if (ctx->screen->info.chip_class >= GFX9) {
-                               LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false);
-                               idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
-                               ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
-                               continue;
-                       }
-
-                       ac_build_buffer_store_dword(&ctx->ac,
-                                                   ctx->esgs_ring,
-                                                   out_val, 1, NULL,
-                                                   ac_get_arg(&ctx->ac, ctx->es2gs_offset),
-                                                   (4 * param + chan) * 4,
-                                                   ac_glc | ac_slc | ac_swizzled);
-               }
-       }
-
-       if (ctx->screen->info.chip_class >= GFX9)
-               si_set_es_return_value_for_gs(ctx);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader *es = ctx->shader;
+   struct si_shader_info *info = &es->selector->info;
+   LLVMValueRef lds_base = NULL;
+   unsigned chan;
+   int i;
+
+   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
+      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
+      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
+      vertex_idx =
+         LLVMBuildOr(ctx->ac.builder, vertex_idx,
+                     LLVMBuildMul(ctx->ac.builder, wave_idx,
+                                  LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
+                     "");
+      lds_base =
+         LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
+   }
+
+   for (i = 0; i < info->num_outputs; i++) {
+      int param;
+
+      if (info->output_semantic[i] == VARYING_SLOT_VIEWPORT ||
+          info->output_semantic[i] == VARYING_SLOT_LAYER)
+         continue;
+
+      param = si_shader_io_get_unique_index(info->output_semantic[i], false);
+
+      for (chan = 0; chan < 4; chan++) {
+         if (!(info->output_usagemask[i] & (1 << chan)))
+            continue;
+
+         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+         out_val = ac_to_integer(&ctx->ac, out_val);
+
+         /* GFX9 has the ESGS ring in LDS. */
+         if (ctx->screen->info.chip_class >= GFX9) {
+            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
+            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
+            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
+            continue;
+         }
+
+         ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
+                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
+                                     (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
+      }
+   }
+
+   if (ctx->screen->info.chip_class >= GFX9)
+      si_set_es_return_value_for_gs(ctx);
  }
  
  static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
  {
-       if (ctx->screen->info.chip_class >= GFX9)
-               return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
-       else
-               return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
+   if (ctx->screen->info.chip_class >= GFX9)
+      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
+   else
+      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
  }
  
  static void emit_gs_epilogue(struct si_shader_context *ctx)
  {
-       if (ctx->shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_epilogue(ctx);
-               return;
-       }
+   if (ctx->shader->key.as_ngg) {
+      gfx10_ngg_gs_emit_epilogue(ctx);
+      return;
+   }
  
-       if (ctx->screen->info.chip_class >= GFX10)
-               LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
+   if (ctx->screen->info.chip_class >= GFX10)
+      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
  
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
-                        si_get_gs_wave_id(ctx));
+   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));
  
-       if (ctx->screen->info.chip_class >= GFX9)
-               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+   if (ctx->screen->info.chip_class >= GFX9)
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
  }
  
-static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
-                                    unsigned max_outputs,
-                                    LLVMValueRef *addrs)
+static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                                     LLVMValueRef *addrs)
  {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
  
-       assert(info->num_outputs <= max_outputs);
+   assert(info->num_outputs <= max_outputs);
  
-       emit_gs_epilogue(ctx);
+   emit_gs_epilogue(ctx);
  }
  
  /* Emit one vertex from the geometry shader */
-static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
-                               unsigned stream,
-                               LLVMValueRef *addrs)
+static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
  {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-       if (ctx->shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
-               return;
-       }
-
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       struct si_shader *shader = ctx->shader;
-       LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
-       LLVMValueRef gs_next_vertex;
-       LLVMValueRef can_emit;
-       unsigned chan, offset;
-       int i;
-
-       /* Write vertex attribute values to GSVS ring */
-       gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
-                                      ctx->gs_next_vertex[stream],
-                                      "");
-
-       /* If this thread has already emitted the declared maximum number of
-        * vertices, skip the write: excessive vertex emissions are not
-        * supposed to have any effect.
-        *
-        * If the shader has no writes to memory, kill it instead. This skips
-        * further memory loads and may allow LLVM to skip to the end
-        * altogether.
-        */
-       can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
-                                LLVMConstInt(ctx->i32,
-                                             shader->selector->gs_max_out_vertices, 0), "");
-
-       bool use_kill = !info->writes_memory;
-       if (use_kill) {
-               ac_build_kill_if_false(&ctx->ac, can_emit);
-       } else {
-               ac_build_ifcc(&ctx->ac, can_emit, 6505);
-       }
-
-       offset = 0;
-       for (i = 0; i < info->num_outputs; i++) {
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)) ||
-                           ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-                       LLVMValueRef voffset =
-                               LLVMConstInt(ctx->i32, offset *
-                                            shader->selector->gs_max_out_vertices, 0);
-                       offset++;
-
-                       voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
-                       voffset = LLVMBuildMul(ctx->ac.builder, voffset,
-                                              LLVMConstInt(ctx->i32, 4, 0), "");
-
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-
-                       ac_build_buffer_store_dword(&ctx->ac,
-                                                   ctx->gsvs_ring[stream],
-                                                   out_val, 1,
-                                                   voffset, soffset, 0,
-                                                   ac_glc | ac_slc | ac_swizzled);
-               }
-       }
-
-       gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
-       LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
-
-       /* Signal vertex emission if vertex data was written. */
-       if (offset) {
-               ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-                                si_get_gs_wave_id(ctx));
-       }
-
-       if (!use_kill)
-               ac_build_endif(&ctx->ac, 6505);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+   if (ctx->shader->key.as_ngg) {
+      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+      return;
+   }
+
+   struct si_shader_info *info = &ctx->shader->selector->info;
+   struct si_shader *shader = ctx->shader;
+   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
+   LLVMValueRef gs_next_vertex;
+   LLVMValueRef can_emit;
+   unsigned chan, offset;
+   int i;
+
+   /* Write vertex attribute values to GSVS ring */
+   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");
+
+   /* If this thread has already emitted the declared maximum number of
+    * vertices, skip the write: excessive vertex emissions are not
+    * supposed to have any effect.
+    *
+    * If the shader has no writes to memory, kill it instead. This skips
+    * further memory loads and may allow LLVM to skip to the end
+    * altogether.
+    */
+   can_emit =
+      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
+                    LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");
+
+   bool use_kill = !info->base.writes_memory;
+   if (use_kill) {
+      ac_build_kill_if_false(&ctx->ac, can_emit);
+   } else {
+      ac_build_ifcc(&ctx->ac, can_emit, 6505);
+   }
+
+   offset = 0;
+   for (i = 0; i < info->num_outputs; i++) {
+      for (chan = 0; chan < 4; chan++) {
+         if (!(info->output_usagemask[i] & (1 << chan)) ||
+             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
+            continue;
+
+         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+         LLVMValueRef voffset =
+            LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
+         offset++;
+
+         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+         out_val = ac_to_integer(&ctx->ac, out_val);
+
+         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
+                                     0, ac_glc | ac_slc | ac_swizzled);
+      }
+   }
+
+   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
+   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
+
+   /* Signal vertex emission if vertex data was written. */
+   if (offset) {
+      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+                       si_get_gs_wave_id(ctx));
+   }
+
+   if (!use_kill)
+      ac_build_endif(&ctx->ac, 6505);
  }
  
  /* Cut one primitive from the geometry shader */
-static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
-                                  unsigned stream)
+static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
  {
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
  
-       if (ctx->shader->key.as_ngg) {
-               LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
-               return;
-       }
+   if (ctx->shader->key.as_ngg) {
+      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
+      return;
+   }
  
-       /* Signal primitive cut */
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
-                        si_get_gs_wave_id(ctx));
+   /* Signal primitive cut */
+   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
+                    si_get_gs_wave_id(ctx));
  }
  
  void si_preload_esgs_ring(struct si_shader_context *ctx)
  {
-       if (ctx->screen->info.chip_class <= GFX8) {
-               unsigned ring =
-                       ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
-                                                         : SI_ES_RING_ESGS;
-               LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
-               LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-
-               ctx->esgs_ring =
-                       ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-       } else {
-               if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
-                       /* Declare the ESGS ring as an explicit LDS symbol. */
-                       si_llvm_declare_esgs_ring(ctx);
-               } else {
-                       ac_declare_lds_as_pointer(&ctx->ac);
-                       ctx->esgs_ring = ctx->ac.lds;
-               }
-       }
+   if (ctx->screen->info.chip_class <= GFX8) {
+      unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
+      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
+      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+
+      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+   } else {
+      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+         /* Declare the ESGS ring as an explicit LDS symbol. */
+         si_llvm_declare_esgs_ring(ctx);
+      } else {
+         ac_declare_lds_as_pointer(&ctx->ac);
+         ctx->esgs_ring = ctx->ac.lds;
+      }
+   }
  }
  
  void si_preload_gs_rings(struct si_shader_context *ctx)
  {
-       const struct si_shader_selector *sel = ctx->shader->selector;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-       LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
-       /* The conceptual layout of the GSVS ring is
-        *   v0c0 .. vLv0 v0c1 .. vLc1 ..
-        * but the real memory layout is swizzled across
-        * threads:
-        *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
-        *   t16v0c0 ..
-        * Override the buffer descriptor accordingly.
-        */
-       LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
-       uint64_t stream_offset = 0;
-
-       for (unsigned stream = 0; stream < 4; ++stream) {
-               unsigned num_components;
-               unsigned stride;
-               unsigned num_records;
-               LLVMValueRef ring, tmp;
-
-               num_components = sel->info.num_stream_output_components[stream];
-               if (!num_components)
-                       continue;
-
-               stride = 4 * num_components * sel->gs_max_out_vertices;
-
-               /* Limit on the stride field for <= GFX7. */
-               assert(stride < (1 << 14));
-
-               num_records = ctx->ac.wave_size;
-
-               ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
-               tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
-               tmp = LLVMBuildAdd(builder, tmp,
-                                  LLVMConstInt(ctx->i64,
-                                               stream_offset, 0), "");
-               stream_offset += stride * ctx->ac.wave_size;
-
-               ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
-               ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
-               tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
-               tmp = LLVMBuildOr(builder, tmp,
-                       LLVMConstInt(ctx->i32,
-                                    S_008F04_STRIDE(stride) |
-                                    S_008F04_SWIZZLE_ENABLE(1), 0), "");
-               ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
-               ring = LLVMBuildInsertElement(builder, ring,
-                               LLVMConstInt(ctx->i32, num_records, 0),
-                               LLVMConstInt(ctx->i32, 2, 0), "");
-
-               uint32_t rsrc3 =
-                               S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                               S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                               S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                               S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                               S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
-                               S_008F0C_ADD_TID_ENABLE(1);
-
-               if (ctx->ac.chip_class >= GFX10) {
-                       rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
-                                S_008F0C_RESOURCE_LEVEL(1);
-               } else {
-                       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
-                                S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
-               }
-
-               ring = LLVMBuildInsertElement(builder, ring,
-                       LLVMConstInt(ctx->i32, rsrc3, false),
-                       LLVMConstInt(ctx->i32, 3, 0), "");
-
-               ctx->gsvs_ring[stream] = ring;
-       }
+   const struct si_shader_selector *sel = ctx->shader->selector;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+
+   /* The conceptual layout of the GSVS ring is
+    *   v0c0 .. vLv0 v0c1 .. vLc1 ..
+    * but the real memory layout is swizzled across
+    * threads:
+    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
+    *   t16v0c0 ..
+    * Override the buffer descriptor accordingly.
+    */
+   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
+   uint64_t stream_offset = 0;
+
+   for (unsigned stream = 0; stream < 4; ++stream) {
+      unsigned num_components;
+      unsigned stride;
+      unsigned num_records;
+      LLVMValueRef ring, tmp;
+
+      num_components = sel->info.num_stream_output_components[stream];
+      if (!num_components)
+         continue;
+
+      stride = 4 * num_components * sel->info.base.gs.vertices_out;
+
+      /* Limit on the stride field for <= GFX7. */
+      assert(stride < (1 << 14));
+
+      num_records = ctx->ac.wave_size;
+
+      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
+      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
+      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
+      stream_offset += stride * ctx->ac.wave_size;
+
+      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
+      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
+      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
+      tmp = LLVMBuildOr(
+         builder, tmp,
+         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
+      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
+      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
+                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");
+
+      uint32_t rsrc3 =
+         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
+         S_008F0C_ADD_TID_ENABLE(1);
+
+      if (ctx->ac.chip_class >= GFX10) {
+         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
+      } else {
+         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
+      }
+
+      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
+                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");
+
+      ctx->gsvs_ring[stream] = ring;
+   }
  }
  
  /* Generate code for the hardware VS shader stage to go with a geometry shader */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
-                          struct ac_llvm_compiler *compiler,
-                          struct si_shader_selector *gs_selector,
-                          struct pipe_debug_callback *debug)
+struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
+                                             struct ac_llvm_compiler *compiler,
+                                             struct si_shader_selector *gs_selector,
+                                             struct pipe_debug_callback *debug)
  {
-       struct si_shader_context ctx;
-       struct si_shader *shader;
-       LLVMBuilderRef builder;
-       struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
-       struct si_shader_info *gsinfo = &gs_selector->info;
-       int i;
-
-
-       shader = CALLOC_STRUCT(si_shader);
-       if (!shader)
-               return NULL;
-
-       /* We can leave the fence as permanently signaled because the GS copy
-        * shader only becomes visible globally after it has been compiled. */
-       util_queue_fence_init(&shader->ready);
-
-       shader->selector = gs_selector;
-       shader->is_gs_copy_shader = true;
+   struct si_shader_context ctx;
+   struct si_shader *shader;
+   LLVMBuilderRef builder;
+   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
+   struct si_shader_info *gsinfo = &gs_selector->info;
+   int i;
+
+   shader = CALLOC_STRUCT(si_shader);
+   if (!shader)
+      return NULL;
+
+   /* We can leave the fence as permanently signaled because the GS copy
+    * shader only becomes visible globally after it has been compiled. */
+   util_queue_fence_init(&shader->ready);
+
+   shader->selector = gs_selector;
+   shader->is_gs_copy_shader = true;
+
+   si_llvm_context_init(&ctx, sscreen, compiler,
+                        si_get_wave_size(sscreen, MESA_SHADER_VERTEX,
+                                         false, false, false, false));
+   ctx.shader = shader;
+   ctx.stage = MESA_SHADER_VERTEX;
+
+   builder = ctx.ac.builder;
+
+   si_create_function(&ctx, false);
+
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
+   ctx.gsvs_ring[0] =
+      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));
+
+   LLVMValueRef voffset =
+      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");
+
+   /* Fetch the vertex stream ID.*/
+   LLVMValueRef stream_id;
+
+   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
+      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
+   else
+      stream_id = ctx.ac.i32_0;
+
+   /* Fill in output information. */
+   for (i = 0; i < gsinfo->num_outputs; ++i) {
+      outputs[i].semantic = gsinfo->output_semantic[i];
+
+      for (int chan = 0; chan < 4; chan++) {
+         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
+      }
+   }
+
+   LLVMBasicBlockRef end_bb;
+   LLVMValueRef switch_inst;
+
+   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
+   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
+
+   for (int stream = 0; stream < 4; stream++) {
+      LLVMBasicBlockRef bb;
+      unsigned offset;
+
+      if (!gsinfo->num_stream_output_components[stream])
+         continue;
+
+      if (stream > 0 && !gs_selector->so.num_outputs)
+         continue;
+
+      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
+      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
+      LLVMPositionBuilderAtEnd(builder, bb);
+
+      /* Fetch vertex data from GSVS ring */
+      offset = 0;
+      for (i = 0; i < gsinfo->num_outputs; ++i) {
+         for (unsigned chan = 0; chan < 4; chan++) {
+            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
+                outputs[i].vertex_stream[chan] != stream) {
+               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
+               continue;
+            }
+
+            LLVMValueRef soffset =
+               LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
+            offset++;
+
+            outputs[i].values[chan] =
+               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
+                                    ac_glc | ac_slc, true, false);
+         }
+      }
+
+      /* Streamout and exports. */
+      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
+         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
+      }
+
+      if (stream == 0)
+         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
+
+      LLVMBuildBr(builder, end_bb);
+   }
+
+   LLVMPositionBuilderAtEnd(builder, end_bb);
+
+   LLVMBuildRetVoid(ctx.ac.builder);
+
+   ctx.stage = MESA_SHADER_GEOMETRY; /* override for shader dumping */
+   si_llvm_optimize_module(&ctx);
+
+   bool ok = false;
+   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
+                       debug, MESA_SHADER_GEOMETRY, "GS Copy Shader", false)) {
+      if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY))
+         fprintf(stderr, "GS Copy Shader:\n");
+      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
+
+      if (!ctx.shader->config.scratch_bytes_per_wave)
+         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
+      else
+         ok = true;
+   }
  
-       si_llvm_context_init(&ctx, sscreen, compiler,
-                            si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false));
-       ctx.shader = shader;
-       ctx.type = PIPE_SHADER_VERTEX;
-
-       builder = ctx.ac.builder;
-
-       si_create_function(&ctx);
-
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
-       ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr,
-                                                LLVMConstInt(ctx.i32, SI_RING_GSVS, 0));
-
-       LLVMValueRef voffset =
-               LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
-                            LLVMConstInt(ctx.i32, 4, 0), "");
-
-       /* Fetch the vertex stream ID.*/
-       LLVMValueRef stream_id;
-
-       if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
-               stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
-       else
-               stream_id = ctx.i32_0;
-
-       /* Fill in output information. */
-       for (i = 0; i < gsinfo->num_outputs; ++i) {
-               outputs[i].semantic_name = gsinfo->output_semantic_name[i];
-               outputs[i].semantic_index = gsinfo->output_semantic_index[i];
-
-               for (int chan = 0; chan < 4; chan++) {
-                       outputs[i].vertex_stream[chan] =
-                               (gsinfo->output_streams[i] >> (2 * chan)) & 3;
-               }
-       }
-
-       LLVMBasicBlockRef end_bb;
-       LLVMValueRef switch_inst;
-
-       end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
-       switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
-
-       for (int stream = 0; stream < 4; stream++) {
-               LLVMBasicBlockRef bb;
-               unsigned offset;
-
-               if (!gsinfo->num_stream_output_components[stream])
-                       continue;
-
-               if (stream > 0 && !gs_selector->so.num_outputs)
-                       continue;
-
-               bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
-               LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
-               LLVMPositionBuilderAtEnd(builder, bb);
-
-               /* Fetch vertex data from GSVS ring */
-               offset = 0;
-               for (i = 0; i < gsinfo->num_outputs; ++i) {
-                       for (unsigned chan = 0; chan < 4; chan++) {
-                               if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
-                                   outputs[i].vertex_stream[chan] != stream) {
-                                       outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
-                                       continue;
-                               }
-
-                               LLVMValueRef soffset = LLVMConstInt(ctx.i32,
-                                       offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
-                               offset++;
-
-                               outputs[i].values[chan] =
-                                       ac_build_buffer_load(&ctx.ac,
-                                                            ctx.gsvs_ring[0], 1,
-                                                            ctx.i32_0, voffset,
-                                                            soffset, 0, ac_glc | ac_slc,
-                                                            true, false);
-                       }
-               }
-
-               /* Streamout and exports. */
-               if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
-                       si_llvm_emit_streamout(&ctx, outputs,
-                                              gsinfo->num_outputs,
-                                              stream);
-               }
-
-               if (stream == 0)
-                       si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
-
-               LLVMBuildBr(builder, end_bb);
-       }
-
-       LLVMPositionBuilderAtEnd(builder, end_bb);
-
-       LLVMBuildRetVoid(ctx.ac.builder);
-
-       ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
-       si_llvm_optimize_module(&ctx);
-
-       bool ok = false;
-       if (si_compile_llvm(sscreen, &ctx.shader->binary,
-                           &ctx.shader->config, ctx.compiler,
-                           ctx.ac.module,
-                           debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size,
-                           "GS Copy Shader", false) == 0) {
-               if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
-                       fprintf(stderr, "GS Copy Shader:\n");
-               si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
-
-               if (!ctx.shader->config.scratch_bytes_per_wave)
-                       ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
-               else
-                       ok = true;
-       }
-
-       si_llvm_dispose(&ctx);
-
-       if (!ok) {
-               FREE(shader);
-               shader = NULL;
-       } else {
-               si_fix_resource_usage(sscreen, shader);
-       }
-       return shader;
+   si_llvm_dispose(&ctx);
+
+   if (!ok) {
+      FREE(shader);
+      shader = NULL;
+   } else {
+      si_fix_resource_usage(sscreen, shader);
+   }
+   return shader;
  }
  
  /**
   * Build the GS prolog function. Rotate the input vertices for triangle strips
   * with adjacency.
   */
-void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
-                            union si_shader_part_key *key)
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
  {
-       unsigned num_sgprs, num_vgprs;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMTypeRef returns[AC_MAX_ARGS];
-       LLVMValueRef func, ret;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               if (key->gs_prolog.states.gfx9_prev_is_vs)
-                       num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
-               else
-                       num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
-               num_vgprs = 5; /* ES inputs are not needed by GS */
-       } else {
-               num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-               num_vgprs = 8;
-       }
-
-       for (unsigned i = 0; i < num_sgprs; ++i) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               returns[i] = ctx->i32;
-       }
-
-       for (unsigned i = 0; i < num_vgprs; ++i) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
-               returns[num_sgprs + i] = ctx->f32;
-       }
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
-       func = ctx->main_fn;
-
-       /* Set the full EXEC mask for the prolog, because we are only fiddling
-        * with registers here. The main shader part will set the correct EXEC
-        * mask.
-        */
-       if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
-               ac_init_exec_full_mask(&ctx->ac);
-
-       /* Copy inputs to outputs. This should be no-op, as the registers match,
-        * but it will prevent the compiler from overwriting them unintentionally.
-        */
-       ret = ctx->return_value;
-       for (unsigned i = 0; i < num_sgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, i);
-               ret = LLVMBuildInsertValue(builder, ret, p, i, "");
-       }
-       for (unsigned i = 0; i < num_vgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
-               p = ac_to_float(&ctx->ac, p);
-               ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
-       }
-
-       if (key->gs_prolog.states.tri_strip_adj_fix) {
-               /* Remap the input vertices for every other primitive. */
-               const struct ac_arg gfx6_vtx_params[6] = {
-                       { .used = true, .arg_index = num_sgprs },
-                       { .used = true, .arg_index = num_sgprs + 1 },
-                       { .used = true, .arg_index = num_sgprs + 3 },
-                       { .used = true, .arg_index = num_sgprs + 4 },
-                       { .used = true, .arg_index = num_sgprs + 5 },
-                       { .used = true, .arg_index = num_sgprs + 6 },
-               };
-               const struct ac_arg gfx9_vtx_params[3] = {
-                       { .used = true, .arg_index = num_sgprs },
-                       { .used = true, .arg_index = num_sgprs + 1 },
-                       { .used = true, .arg_index = num_sgprs + 4 },
-               };
-               LLVMValueRef vtx_in[6], vtx_out[6];
-               LLVMValueRef prim_id, rotate;
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       for (unsigned i = 0; i < 3; i++) {
-                               vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
-                               vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
-                       }
-               } else {
-                       for (unsigned i = 0; i < 6; i++)
-                               vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
-               }
-
-               prim_id = LLVMGetParam(func, num_sgprs + 2);
-               rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
-
-               for (unsigned i = 0; i < 6; ++i) {
-                       LLVMValueRef base, rotated;
-                       base = vtx_in[i];
-                       rotated = vtx_in[(i + 4) % 6];
-                       vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
-               }
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       for (unsigned i = 0; i < 3; i++) {
-                               LLVMValueRef hi, out;
-
-                               hi = LLVMBuildShl(builder, vtx_out[i*2+1],
-                                                 LLVMConstInt(ctx->i32, 16, 0), "");
-                               out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
-                               out = ac_to_float(&ctx->ac, out);
-                               ret = LLVMBuildInsertValue(builder, ret, out,
-                                                          gfx9_vtx_params[i].arg_index, "");
-                       }
-               } else {
-                       for (unsigned i = 0; i < 6; i++) {
-                               LLVMValueRef out;
-
-                               out = ac_to_float(&ctx->ac, vtx_out[i]);
-                               ret = LLVMBuildInsertValue(builder, ret, out,
-                                                          gfx6_vtx_params[i].arg_index, "");
-                       }
-               }
-       }
-
-       LLVMBuildRet(builder, ret);
+   unsigned num_sgprs, num_vgprs;
+   LLVMBuilderRef builder = ctx->ac.builder;
+   LLVMTypeRef returns[AC_MAX_ARGS];
+   LLVMValueRef func, ret;
+
+   memset(&ctx->args, 0, sizeof(ctx->args));
+
+   if (ctx->screen->info.chip_class >= GFX9) {
+      if (key->gs_prolog.states.gfx9_prev_is_vs)
+         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
+      else
+         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
+      num_vgprs = 5; /* ES inputs are not needed by GS */
+   } else {
+      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+      num_vgprs = 8;
+   }
+
+   for (unsigned i = 0; i < num_sgprs; ++i) {
+      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+      returns[i] = ctx->ac.i32;
+   }
+
+   for (unsigned i = 0; i < num_vgprs; ++i) {
+      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+      returns[num_sgprs + i] = ctx->ac.f32;
+   }
+
+   /* Create the function. */
+   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
+   func = ctx->main_fn;
+
+   /* Set the full EXEC mask for the prolog, because we are only fiddling
+    * with registers here. The main shader part will set the correct EXEC
+    * mask.
+    */
+   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
+      ac_init_exec_full_mask(&ctx->ac);
+
+   /* Copy inputs to outputs. This should be no-op, as the registers match,
+    * but it will prevent the compiler from overwriting them unintentionally.
+    */
+   ret = ctx->return_value;
+   for (unsigned i = 0; i < num_sgprs; i++) {
+      LLVMValueRef p = LLVMGetParam(func, i);
+      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
+   }
+   for (unsigned i = 0; i < num_vgprs; i++) {
+      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
+      p = ac_to_float(&ctx->ac, p);
+      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
+   }
+
+   if (key->gs_prolog.states.tri_strip_adj_fix) {
+      /* Remap the input vertices for every other primitive. */
+      const struct ac_arg gfx6_vtx_params[6] = {
+         {.used = true, .arg_index = num_sgprs},     {.used = true, .arg_index = num_sgprs + 1},
+         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
+         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
+      };
+      const struct ac_arg gfx9_vtx_params[3] = {
+         {.used = true, .arg_index = num_sgprs},
+         {.used = true, .arg_index = num_sgprs + 1},
+         {.used = true, .arg_index = num_sgprs + 4},
+      };
+      LLVMValueRef vtx_in[6], vtx_out[6];
+      LLVMValueRef prim_id, rotate;
+
+      if (ctx->screen->info.chip_class >= GFX9) {
+         for (unsigned i = 0; i < 3; i++) {
+            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
+            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
+         }
+      } else {
+         for (unsigned i = 0; i < 6; i++)
+            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
+      }
+
+      prim_id = LLVMGetParam(func, num_sgprs + 2);
+      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");
+
+      for (unsigned i = 0; i < 6; ++i) {
+         LLVMValueRef base, rotated;
+         base = vtx_in[i];
+         rotated = vtx_in[(i + 4) % 6];
+         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
+      }
+
+      if (ctx->screen->info.chip_class >= GFX9) {
+         for (unsigned i = 0; i < 3; i++) {
+            LLVMValueRef hi, out;
+
+            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
+            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
+            out = ac_to_float(&ctx->ac, out);
+            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
+         }
+      } else {
+         for (unsigned i = 0; i < 6; i++) {
+            LLVMValueRef out;
+
+            out = ac_to_float(&ctx->ac, vtx_out[i]);
+            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
+         }
+      }
+   }
+
+   LLVMBuildRet(builder, ret);
  }
  
  void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
  {
-       ctx->abi.load_inputs = si_nir_load_input_gs;
-       ctx->abi.emit_vertex = si_llvm_emit_vertex;
-       ctx->abi.emit_primitive = si_llvm_emit_primitive;
-       ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
+   ctx->abi.load_inputs = si_nir_load_input_gs;
+   ctx->abi.emit_vertex = si_llvm_emit_vertex;
+   ctx->abi.emit_primitive = si_llvm_emit_primitive;
+   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
  }