radeonsi: move geometry shader code into si_shader_llvm_gs.c
author Marek Olšák <marek.olsak@amd.com>
Wed, 15 Jan 2020 01:03:48 +0000 (20:03 -0500)
committer Marge Bot <eric+marge@anholt.net>
Wed, 15 Jan 2020 21:54:55 +0000 (21:54 +0000)
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3399>

src/gallium/drivers/radeonsi/Makefile.sources
src/gallium/drivers/radeonsi/meson.build
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_shader_internal.h
src/gallium/drivers/radeonsi/si_shader_llvm_build.c
src/gallium/drivers/radeonsi/si_shader_llvm_gs.c [new file with mode: 0644]

index eca13c29784c064d20540974250e4b89446fb901..152e8c746898efecd0aba9c430456a48a37e0a70 100644 (file)
@@ -37,6 +37,7 @@ C_SOURCES := \
        si_shader_internal.h \
        si_shader_llvm.c \
        si_shader_llvm_build.c \
+       si_shader_llvm_gs.c \
        si_shader_llvm_ps.c \
        si_shader_llvm_tess.c \
        si_shader_nir.c \
index cca69c58b2c20e0f3cde3cd6ad3ecf6e9d9a4a09..9bf63f57b2c6371614f84b275d391d99dc714bca 100644 (file)
@@ -52,6 +52,7 @@ files_libradeonsi = files(
   'si_shader_internal.h',
   'si_shader_llvm.c',
   'si_shader_llvm_build.c',
+  'si_shader_llvm_gs.c',
   'si_shader_llvm_ps.c',
   'si_shader_llvm_tess.c',
   'si_shader_nir.c',
index cd352ec4cb2e8132ab8b7c16c7c11b72d65e6f90..ab6751f44b494d88c5a6336005eb310660bd5321 100644 (file)
@@ -49,8 +49,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
 
 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
-static void si_fix_resource_usage(struct si_screen *sscreen,
-                                 struct si_shader *shader);
 
 /** Whether the shader runs as a combination of multiple API shaders */
 static bool is_multi_part_shader(struct si_shader_context *ctx)
@@ -428,122 +426,6 @@ LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
        }
 }
 
-static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
-                                         unsigned input_index,
-                                         unsigned vtx_offset_param,
-                                         LLVMTypeRef type,
-                                         unsigned swizzle)
-{
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *shader = ctx->shader;
-       LLVMValueRef vtx_offset, soffset;
-       struct si_shader_info *info = &shader->selector->info;
-       unsigned semantic_name = info->input_semantic_name[input_index];
-       unsigned semantic_index = info->input_semantic_index[input_index];
-       unsigned param;
-       LLVMValueRef value;
-
-       param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
-
-       /* GFX9 has the ESGS ring in LDS. */
-       if (ctx->screen->info.chip_class >= GFX9) {
-               unsigned index = vtx_offset_param;
-
-               switch (index / 2) {
-               case 0:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               case 1:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               case 2:
-                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
-                                                    index % 2 ? 16 : 0, 16);
-                       break;
-               default:
-                       assert(0);
-                       return NULL;
-               }
-
-               unsigned offset = param * 4 + swizzle;
-               vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
-                                         LLVMConstInt(ctx->i32, offset, false), "");
-
-               LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
-               LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
-               if (ac_get_type_size(type) == 64) {
-                       ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
-                                          &ctx->ac.i32_1, 1, "");
-                       LLVMValueRef values[2] = {
-                               value,
-                               LLVMBuildLoad(ctx->ac.builder, ptr, "")
-                       };
-                       value = ac_build_gather_values(&ctx->ac, values, 2);
-               }
-               return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
-       }
-
-       /* GFX6: input load from the ESGS ring in memory. */
-       if (swizzle == ~0) {
-               LLVMValueRef values[4];
-               unsigned chan;
-               for (chan = 0; chan < 4; chan++) {
-                       values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
-                                                            type, chan);
-               }
-               return ac_build_gather_values(&ctx->ac, values, 4);
-       }
-
-       /* Get the vertex offset parameter on GFX6. */
-       LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
-                                               ctx->gs_vtx_offset[vtx_offset_param]);
-
-       vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
-                                 LLVMConstInt(ctx->i32, 4, 0), "");
-
-       soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
-
-       value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
-                                    vtx_offset, soffset, 0, ac_glc, true, false);
-       if (ac_get_type_size(type) == 64) {
-               LLVMValueRef value2;
-               soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
-
-               value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
-                                             ctx->i32_0, vtx_offset, soffset,
-                                             0, ac_glc, true, false);
-               return si_build_gather_64bit(ctx, type, value, value2);
-       }
-       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
-}
-
-static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
-                                        unsigned location,
-                                        unsigned driver_location,
-                                        unsigned component,
-                                        unsigned num_components,
-                                        unsigned vertex_index,
-                                        unsigned const_index,
-                                        LLVMTypeRef type)
-{
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-       LLVMValueRef value[4];
-       for (unsigned i = 0; i < num_components; i++) {
-               unsigned offset = i;
-               if (ac_get_type_size(type) == 64)
-                       offset *= 2;
-
-               offset += component;
-               value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4 + const_index,
-                                                            vertex_index, type, offset);
-       }
-
-       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
-}
-
 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
 {
        struct si_shader_context *ctx = si_shader_context_from_abi(abi);
@@ -816,9 +698,9 @@ void si_emit_streamout_output(struct si_shader_context *ctx,
  * Write streamout data to buffers for vertex stream @p stream (different
  * vertex streams can occur for GS copy shaders).
  */
-static void si_llvm_emit_streamout(struct si_shader_context *ctx,
-                                  struct si_shader_output_values *outputs,
-                                  unsigned noutput, unsigned stream)
+void si_llvm_emit_streamout(struct si_shader_context *ctx,
+                           struct si_shader_output_values *outputs,
+                           unsigned noutput, unsigned stream)
 {
        struct si_shader_selector *sel = ctx->shader->selector;
        struct pipe_stream_output_info *so = &sel->so;
@@ -1178,141 +1060,6 @@ void si_llvm_export_vs(struct si_shader_context *ctx,
        si_build_param_exports(ctx, outputs, noutput);
 }
 
-/* Pass GS inputs from ES to GS on GFX9. */
-static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
-{
-       LLVMValueRef ret = ctx->return_value;
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
-       ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
-       if (ctx->shader->key.as_ngg)
-               ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
-       else
-               ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
-       ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
-
-       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
-                                 8 + SI_SGPR_RW_BUFFERS);
-       ret = si_insert_input_ptr(ctx, ret,
-                                 ctx->bindless_samplers_and_images,
-                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
-       if (ctx->screen->use_ngg) {
-               ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
-                                         8 + SI_SGPR_VS_STATE_BITS);
-       }
-
-       unsigned vgpr;
-       if (ctx->type == PIPE_SHADER_VERTEX)
-               vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
-       else
-               vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
-
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
-       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
-       ctx->return_value = ret;
-}
-
-static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
-                                    unsigned max_outputs,
-                                    LLVMValueRef *addrs)
-{
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader *es = ctx->shader;
-       struct si_shader_info *info = &es->selector->info;
-       LLVMValueRef lds_base = NULL;
-       unsigned chan;
-       int i;
-
-       if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
-               unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
-               LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
-               LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
-               vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
-                                        LLVMBuildMul(ctx->ac.builder, wave_idx,
-                                                     LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), "");
-               lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
-                                       LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
-       }
-
-       for (i = 0; i < info->num_outputs; i++) {
-               int param;
-
-               if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
-                   info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
-                       continue;
-
-               param = si_shader_io_get_unique_index(info->output_semantic_name[i],
-                                                     info->output_semantic_index[i], false);
-
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)))
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-
-                       /* GFX9 has the ESGS ring in LDS. */
-                       if (ctx->screen->info.chip_class >= GFX9) {
-                               LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false);
-                               idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
-                               ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
-                               continue;
-                       }
-
-                       ac_build_buffer_store_dword(&ctx->ac,
-                                                   ctx->esgs_ring,
-                                                   out_val, 1, NULL,
-                                                   ac_get_arg(&ctx->ac, ctx->es2gs_offset),
-                                                   (4 * param + chan) * 4,
-                                                   ac_glc | ac_slc | ac_swizzled);
-               }
-       }
-
-       if (ctx->screen->info.chip_class >= GFX9)
-               si_set_es_return_value_for_gs(ctx);
-}
-
-static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
-{
-       if (ctx->screen->info.chip_class >= GFX9)
-               return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
-       else
-               return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
-}
-
-static void emit_gs_epilogue(struct si_shader_context *ctx)
-{
-       if (ctx->shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_epilogue(ctx);
-               return;
-       }
-
-       if (ctx->screen->info.chip_class >= GFX10)
-               LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
-
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
-                        si_get_gs_wave_id(ctx));
-
-       if (ctx->screen->info.chip_class >= GFX9)
-               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
-}
-
-static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
-                                    unsigned max_outputs,
-                                    LLVMValueRef *addrs)
-{
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-       struct si_shader_info UNUSED *info = &ctx->shader->selector->info;
-
-       assert(info->num_outputs <= max_outputs);
-
-       emit_gs_epilogue(ctx);
-}
-
 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
                                     unsigned max_outputs,
                                     LLVMValueRef *addrs)
@@ -1389,106 +1136,6 @@ static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
        ctx->return_value = ret;
 }
 
-/* Emit one vertex from the geometry shader */
-static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
-                               unsigned stream,
-                               LLVMValueRef *addrs)
-{
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-       if (ctx->shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
-               return;
-       }
-
-       struct si_shader_info *info = &ctx->shader->selector->info;
-       struct si_shader *shader = ctx->shader;
-       LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
-       LLVMValueRef gs_next_vertex;
-       LLVMValueRef can_emit;
-       unsigned chan, offset;
-       int i;
-
-       /* Write vertex attribute values to GSVS ring */
-       gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
-                                      ctx->gs_next_vertex[stream],
-                                      "");
-
-       /* If this thread has already emitted the declared maximum number of
-        * vertices, skip the write: excessive vertex emissions are not
-        * supposed to have any effect.
-        *
-        * If the shader has no writes to memory, kill it instead. This skips
-        * further memory loads and may allow LLVM to skip to the end
-        * altogether.
-        */
-       can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
-                                LLVMConstInt(ctx->i32,
-                                             shader->selector->gs_max_out_vertices, 0), "");
-
-       bool use_kill = !info->writes_memory;
-       if (use_kill) {
-               ac_build_kill_if_false(&ctx->ac, can_emit);
-       } else {
-               ac_build_ifcc(&ctx->ac, can_emit, 6505);
-       }
-
-       offset = 0;
-       for (i = 0; i < info->num_outputs; i++) {
-               for (chan = 0; chan < 4; chan++) {
-                       if (!(info->output_usagemask[i] & (1 << chan)) ||
-                           ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
-                               continue;
-
-                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
-                       LLVMValueRef voffset =
-                               LLVMConstInt(ctx->i32, offset *
-                                            shader->selector->gs_max_out_vertices, 0);
-                       offset++;
-
-                       voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
-                       voffset = LLVMBuildMul(ctx->ac.builder, voffset,
-                                              LLVMConstInt(ctx->i32, 4, 0), "");
-
-                       out_val = ac_to_integer(&ctx->ac, out_val);
-
-                       ac_build_buffer_store_dword(&ctx->ac,
-                                                   ctx->gsvs_ring[stream],
-                                                   out_val, 1,
-                                                   voffset, soffset, 0,
-                                                   ac_glc | ac_slc | ac_swizzled);
-               }
-       }
-
-       gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
-       LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
-
-       /* Signal vertex emission if vertex data was written. */
-       if (offset) {
-               ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-                                si_get_gs_wave_id(ctx));
-       }
-
-       if (!use_kill)
-               ac_build_endif(&ctx->ac, 6505);
-}
-
-/* Cut one primitive from the geometry shader */
-static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
-                                  unsigned stream)
-{
-       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-       if (ctx->shader->key.as_ngg) {
-               LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
-               return;
-       }
-
-       /* Signal primitive cut */
-       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
-                        si_get_gs_wave_id(ctx));
-}
-
 static void declare_streamout_params(struct si_shader_context *ctx,
                                     struct pipe_stream_output_info *so)
 {
@@ -1708,7 +1355,7 @@ void si_add_arg_checked(struct ac_shader_args *args,
        ac_add_arg(args, file, registers, type, arg);
 }
 
-static void create_function(struct si_shader_context *ctx)
+void si_create_function(struct si_shader_context *ctx)
 {
        struct si_shader *shader = ctx->shader;
        LLVMTypeRef returns[AC_MAX_ARGS];
@@ -2106,144 +1753,6 @@ static void create_function(struct si_shader_context *ctx)
        }
 }
 
-/* Ensure that the esgs ring is declared.
- *
- * We declare it with 64KB alignment as a hint that the
- * pointer value will always be 0.
- */
-static void declare_esgs_ring(struct si_shader_context *ctx)
-{
-       if (ctx->esgs_ring)
-               return;
-
-       assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
-
-       ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
-               ctx->ac.module, LLVMArrayType(ctx->i32, 0),
-               "esgs_ring",
-               AC_ADDR_SPACE_LDS);
-       LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
-       LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
-}
-
-/**
- * Load ESGS and GSVS ring buffer resource descriptors and save the variables
- * for later use.
- */
-static void preload_ring_buffers(struct si_shader_context *ctx)
-{
-       LLVMBuilderRef builder = ctx->ac.builder;
-
-       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
-
-       if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) {
-               if (ctx->screen->info.chip_class <= GFX8) {
-                       unsigned ring =
-                               ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
-                                                                 : SI_ES_RING_ESGS;
-                       LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
-
-                       ctx->esgs_ring =
-                               ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-               } else {
-                       if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
-                               /* Declare the ESGS ring as an explicit LDS symbol. */
-                               declare_esgs_ring(ctx);
-                       } else {
-                               ac_declare_lds_as_pointer(&ctx->ac);
-                               ctx->esgs_ring = ctx->ac.lds;
-                       }
-               }
-       }
-
-       if (ctx->shader->is_gs_copy_shader) {
-               LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
-
-               ctx->gsvs_ring[0] =
-                       ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-       } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
-               const struct si_shader_selector *sel = ctx->shader->selector;
-               LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
-               LLVMValueRef base_ring;
-
-               base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
-               /* The conceptual layout of the GSVS ring is
-                *   v0c0 .. vLv0 v0c1 .. vLc1 ..
-                * but the real memory layout is swizzled across
-                * threads:
-                *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
-                *   t16v0c0 ..
-                * Override the buffer descriptor accordingly.
-                */
-               LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
-               uint64_t stream_offset = 0;
-
-               for (unsigned stream = 0; stream < 4; ++stream) {
-                       unsigned num_components;
-                       unsigned stride;
-                       unsigned num_records;
-                       LLVMValueRef ring, tmp;
-
-                       num_components = sel->info.num_stream_output_components[stream];
-                       if (!num_components)
-                               continue;
-
-                       stride = 4 * num_components * sel->gs_max_out_vertices;
-
-                       /* Limit on the stride field for <= GFX7. */
-                       assert(stride < (1 << 14));
-
-                       num_records = ctx->ac.wave_size;
-
-                       ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
-                       tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
-                       tmp = LLVMBuildAdd(builder, tmp,
-                                          LLVMConstInt(ctx->i64,
-                                                       stream_offset, 0), "");
-                       stream_offset += stride * ctx->ac.wave_size;
-
-                       ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
-                       ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
-                       tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
-                       tmp = LLVMBuildOr(builder, tmp,
-                               LLVMConstInt(ctx->i32,
-                                            S_008F04_STRIDE(stride) |
-                                            S_008F04_SWIZZLE_ENABLE(1), 0), "");
-                       ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
-                       ring = LLVMBuildInsertElement(builder, ring,
-                                       LLVMConstInt(ctx->i32, num_records, 0),
-                                       LLVMConstInt(ctx->i32, 2, 0), "");
-
-                       uint32_t rsrc3 =
-                                       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-                                       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-                                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-                                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-                                       S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
-                                       S_008F0C_ADD_TID_ENABLE(1);
-
-                       if (ctx->ac.chip_class >= GFX10) {
-                               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
-                                        S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
-                                        S_008F0C_RESOURCE_LEVEL(1);
-                       } else {
-                               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
-                                        S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
-                       }
-
-                       ring = LLVMBuildInsertElement(builder, ring,
-                               LLVMConstInt(ctx->i32, rsrc3, false),
-                               LLVMConstInt(ctx->i32, 3, 0), "");
-
-                       ctx->gsvs_ring[stream] = ring;
-               }
-       } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
-               si_llvm_preload_tes_rings(ctx);
-       }
-}
-
 /* For the UMR disassembler. */
 #define DEBUGGER_END_OF_CODE_MARKER    0xbf9f0000 /* invalid instruction */
 #define DEBUGGER_NUM_MARKERS           5
@@ -2656,16 +2165,16 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
        si_shader_dump_stats(sscreen, shader, file, check_debug_option);
 }
 
-static int si_compile_llvm(struct si_screen *sscreen,
-                          struct si_shader_binary *binary,
-                          struct ac_shader_config *conf,
-                          struct ac_llvm_compiler *compiler,
-                          LLVMModuleRef mod,
-                          struct pipe_debug_callback *debug,
-                          enum pipe_shader_type shader_type,
-                          unsigned wave_size,
-                          const char *name,
-                          bool less_optimized)
+int si_compile_llvm(struct si_screen *sscreen,
+                   struct si_shader_binary *binary,
+                   struct ac_shader_config *conf,
+                   struct ac_llvm_compiler *compiler,
+                   LLVMModuleRef mod,
+                   struct pipe_debug_callback *debug,
+                   enum pipe_shader_type shader_type,
+                   unsigned wave_size,
+                   const char *name,
+                   bool less_optimized)
 {
        unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
 
@@ -2724,155 +2233,6 @@ static int si_compile_llvm(struct si_screen *sscreen,
        return 0;
 }
 
-/* Generate code for the hardware VS shader stage to go with a geometry shader */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
-                          struct ac_llvm_compiler *compiler,
-                          struct si_shader_selector *gs_selector,
-                          struct pipe_debug_callback *debug)
-{
-       struct si_shader_context ctx;
-       struct si_shader *shader;
-       LLVMBuilderRef builder;
-       struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
-       struct si_shader_info *gsinfo = &gs_selector->info;
-       int i;
-
-
-       shader = CALLOC_STRUCT(si_shader);
-       if (!shader)
-               return NULL;
-
-       /* We can leave the fence as permanently signaled because the GS copy
-        * shader only becomes visible globally after it has been compiled. */
-       util_queue_fence_init(&shader->ready);
-
-       shader->selector = gs_selector;
-       shader->is_gs_copy_shader = true;
-
-       si_llvm_context_init(&ctx, sscreen, compiler,
-                            si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false));
-       ctx.shader = shader;
-       ctx.type = PIPE_SHADER_VERTEX;
-
-       builder = ctx.ac.builder;
-
-       create_function(&ctx);
-       preload_ring_buffers(&ctx);
-
-       LLVMValueRef voffset =
-               LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
-                            LLVMConstInt(ctx.i32, 4, 0), "");
-
-       /* Fetch the vertex stream ID.*/
-       LLVMValueRef stream_id;
-
-       if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
-               stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
-       else
-               stream_id = ctx.i32_0;
-
-       /* Fill in output information. */
-       for (i = 0; i < gsinfo->num_outputs; ++i) {
-               outputs[i].semantic_name = gsinfo->output_semantic_name[i];
-               outputs[i].semantic_index = gsinfo->output_semantic_index[i];
-
-               for (int chan = 0; chan < 4; chan++) {
-                       outputs[i].vertex_stream[chan] =
-                               (gsinfo->output_streams[i] >> (2 * chan)) & 3;
-               }
-       }
-
-       LLVMBasicBlockRef end_bb;
-       LLVMValueRef switch_inst;
-
-       end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
-       switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
-
-       for (int stream = 0; stream < 4; stream++) {
-               LLVMBasicBlockRef bb;
-               unsigned offset;
-
-               if (!gsinfo->num_stream_output_components[stream])
-                       continue;
-
-               if (stream > 0 && !gs_selector->so.num_outputs)
-                       continue;
-
-               bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
-               LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
-               LLVMPositionBuilderAtEnd(builder, bb);
-
-               /* Fetch vertex data from GSVS ring */
-               offset = 0;
-               for (i = 0; i < gsinfo->num_outputs; ++i) {
-                       for (unsigned chan = 0; chan < 4; chan++) {
-                               if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
-                                   outputs[i].vertex_stream[chan] != stream) {
-                                       outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
-                                       continue;
-                               }
-
-                               LLVMValueRef soffset = LLVMConstInt(ctx.i32,
-                                       offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
-                               offset++;
-
-                               outputs[i].values[chan] =
-                                       ac_build_buffer_load(&ctx.ac,
-                                                            ctx.gsvs_ring[0], 1,
-                                                            ctx.i32_0, voffset,
-                                                            soffset, 0, ac_glc | ac_slc,
-                                                            true, false);
-                       }
-               }
-
-               /* Streamout and exports. */
-               if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
-                       si_llvm_emit_streamout(&ctx, outputs,
-                                              gsinfo->num_outputs,
-                                              stream);
-               }
-
-               if (stream == 0)
-                       si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
-
-               LLVMBuildBr(builder, end_bb);
-       }
-
-       LLVMPositionBuilderAtEnd(builder, end_bb);
-
-       LLVMBuildRetVoid(ctx.ac.builder);
-
-       ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
-       si_llvm_optimize_module(&ctx);
-
-       bool ok = false;
-       if (si_compile_llvm(sscreen, &ctx.shader->binary,
-                           &ctx.shader->config, ctx.compiler,
-                           ctx.ac.module,
-                           debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size,
-                           "GS Copy Shader", false) == 0) {
-               if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
-                       fprintf(stderr, "GS Copy Shader:\n");
-               si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
-
-               if (!ctx.shader->config.scratch_bytes_per_wave)
-                       ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
-               else
-                       ok = true;
-       }
-
-       si_llvm_dispose(&ctx);
-
-       if (!ok) {
-               FREE(shader);
-               shader = NULL;
-       } else {
-               si_fix_resource_usage(sscreen, shader);
-       }
-       return shader;
-}
-
 static void si_dump_shader_key_vs(const struct si_shader_key *key,
                                  const struct si_vs_prolog_bits *prolog,
                                  const char *prefix, FILE *f)
@@ -3052,22 +2412,6 @@ static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
               key->unpack_instance_id_from_vertex_id;
 }
 
-LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
-{
-       /* Return true if the current thread should execute an ES thread. */
-       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                            ac_get_thread_id(&ctx->ac),
-                            si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
-}
-
-LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
-{
-       /* Return true if the current thread should execute a GS thread. */
-       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                            ac_get_thread_id(&ctx->ac),
-                            si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
-}
-
 static bool si_build_main_function(struct si_shader_context *ctx,
                                   struct nir_shader *nir, bool free_nir)
 {
@@ -3102,10 +2446,7 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                        ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
                break;
        case PIPE_SHADER_GEOMETRY:
-               ctx->abi.load_inputs = si_nir_load_input_gs;
-               ctx->abi.emit_vertex = si_llvm_emit_vertex;
-               ctx->abi.emit_primitive = si_llvm_emit_primitive;
-               ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
+               si_llvm_init_gs_callbacks(ctx);
                break;
        case PIPE_SHADER_FRAGMENT:
                si_llvm_init_ps_callbacks(ctx);
@@ -3121,8 +2462,15 @@ static bool si_build_main_function(struct si_shader_context *ctx,
        ctx->abi.load_ubo = load_ubo;
        ctx->abi.load_ssbo = load_ssbo;
 
-       create_function(ctx);
-       preload_ring_buffers(ctx);
+       si_create_function(ctx);
+
+       if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
+               si_preload_esgs_ring(ctx);
+
+       if (ctx->type == PIPE_SHADER_GEOMETRY)
+               si_preload_gs_rings(ctx);
+       else if (ctx->type == PIPE_SHADER_TESS_EVAL)
+               si_llvm_preload_tes_rings(ctx);
 
        if (ctx->type == PIPE_SHADER_TESS_CTRL &&
            sel->info.tessfactors_are_def_in_all_invocs) {
@@ -3172,7 +2520,7 @@ static bool si_build_main_function(struct si_shader_context *ctx,
                 * avoids bank conflicts for SoA accesses.
                 */
                if (!gfx10_is_ngg_passthrough(shader))
-                       declare_esgs_ring(ctx);
+                       si_llvm_declare_esgs_ring(ctx);
 
                /* This is really only needed when streamout and / or vertex
                 * compaction is enabled.
@@ -3324,129 +2672,6 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info,
                shader_out->info.uses_instanceid = true;
 }
 
-/**
- * Build the GS prolog function. Rotate the input vertices for triangle strips
- * with adjacency.
- */
-static void si_build_gs_prolog_function(struct si_shader_context *ctx,
-                                       union si_shader_part_key *key)
-{
-       unsigned num_sgprs, num_vgprs;
-       LLVMBuilderRef builder = ctx->ac.builder;
-       LLVMTypeRef returns[AC_MAX_ARGS];
-       LLVMValueRef func, ret;
-
-       memset(&ctx->args, 0, sizeof(ctx->args));
-
-       if (ctx->screen->info.chip_class >= GFX9) {
-               if (key->gs_prolog.states.gfx9_prev_is_vs)
-                       num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
-               else
-                       num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
-               num_vgprs = 5; /* ES inputs are not needed by GS */
-       } else {
-               num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-               num_vgprs = 8;
-       }
-
-       for (unsigned i = 0; i < num_sgprs; ++i) {
-               ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
-               returns[i] = ctx->i32;
-       }
-
-       for (unsigned i = 0; i < num_vgprs; ++i) {
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
-               returns[num_sgprs + i] = ctx->f32;
-       }
-
-       /* Create the function. */
-       si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
-       func = ctx->main_fn;
-
-       /* Set the full EXEC mask for the prolog, because we are only fiddling
-        * with registers here. The main shader part will set the correct EXEC
-        * mask.
-        */
-       if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
-               ac_init_exec_full_mask(&ctx->ac);
-
-       /* Copy inputs to outputs. This should be no-op, as the registers match,
-        * but it will prevent the compiler from overwriting them unintentionally.
-        */
-       ret = ctx->return_value;
-       for (unsigned i = 0; i < num_sgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, i);
-               ret = LLVMBuildInsertValue(builder, ret, p, i, "");
-       }
-       for (unsigned i = 0; i < num_vgprs; i++) {
-               LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
-               p = ac_to_float(&ctx->ac, p);
-               ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
-       }
-
-       if (key->gs_prolog.states.tri_strip_adj_fix) {
-               /* Remap the input vertices for every other primitive. */
-               const struct ac_arg gfx6_vtx_params[6] = {
-                       { .used = true, .arg_index = num_sgprs },
-                       { .used = true, .arg_index = num_sgprs + 1 },
-                       { .used = true, .arg_index = num_sgprs + 3 },
-                       { .used = true, .arg_index = num_sgprs + 4 },
-                       { .used = true, .arg_index = num_sgprs + 5 },
-                       { .used = true, .arg_index = num_sgprs + 6 },
-               };
-               const struct ac_arg gfx9_vtx_params[3] = {
-                       { .used = true, .arg_index = num_sgprs },
-                       { .used = true, .arg_index = num_sgprs + 1 },
-                       { .used = true, .arg_index = num_sgprs + 4 },
-               };
-               LLVMValueRef vtx_in[6], vtx_out[6];
-               LLVMValueRef prim_id, rotate;
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       for (unsigned i = 0; i < 3; i++) {
-                               vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
-                               vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
-                       }
-               } else {
-                       for (unsigned i = 0; i < 6; i++)
-                               vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
-               }
-
-               prim_id = LLVMGetParam(func, num_sgprs + 2);
-               rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
-
-               for (unsigned i = 0; i < 6; ++i) {
-                       LLVMValueRef base, rotated;
-                       base = vtx_in[i];
-                       rotated = vtx_in[(i + 4) % 6];
-                       vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
-               }
-
-               if (ctx->screen->info.chip_class >= GFX9) {
-                       for (unsigned i = 0; i < 3; i++) {
-                               LLVMValueRef hi, out;
-
-                               hi = LLVMBuildShl(builder, vtx_out[i*2+1],
-                                                 LLVMConstInt(ctx->i32, 16, 0), "");
-                               out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
-                               out = ac_to_float(&ctx->ac, out);
-                               ret = LLVMBuildInsertValue(builder, ret, out,
-                                                          gfx9_vtx_params[i].arg_index, "");
-                       }
-               } else {
-                       for (unsigned i = 0; i < 6; i++) {
-                               LLVMValueRef out;
-
-                               out = ac_to_float(&ctx->ac, vtx_out[i]);
-                               ret = LLVMBuildInsertValue(builder, ret, out,
-                                                          gfx6_vtx_params[i].arg_index, "");
-                       }
-               }
-       }
-
-       LLVMBuildRet(builder, ret);
-}
-
 /**
  * Given a list of shader part functions, build a wrapper function that
  * runs them in sequence to form a monolithic shader.
@@ -3900,7 +3125,7 @@ int si_compile_shader(struct si_screen *sscreen,
                        gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
                        gs_prolog_key.gs_prolog.is_monolithic = true;
                        gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
-                       si_build_gs_prolog_function(&ctx, &gs_prolog_key);
+                       si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
                        gs_prolog = ctx.main_fn;
 
                        /* ES main part */
@@ -3959,7 +3184,7 @@ int si_compile_shader(struct si_screen *sscreen,
 
                        memset(&prolog_key, 0, sizeof(prolog_key));
                        prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-                       si_build_gs_prolog_function(&ctx, &prolog_key);
+                       si_llvm_build_gs_prolog(&ctx, &prolog_key);
                        parts[0] = ctx.main_fn;
 
                        si_build_wrapper_function(&ctx, parts, 2, 1, 0);
@@ -4431,7 +3656,7 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen,
        shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
                                            PIPE_SHADER_GEOMETRY, true,
                                            &prolog_key, compiler, debug,
-                                           si_build_gs_prolog_function,
+                                           si_llvm_build_gs_prolog,
                                            "Geometry Shader Prolog");
        return shader->prolog2 != NULL;
 }
@@ -4722,8 +3947,7 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
                *lds_size = MAX2(*lds_size, 8);
 }
 
-static void si_fix_resource_usage(struct si_screen *sscreen,
-                                 struct si_shader *shader)
+void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
 {
        unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
 
index 7a26083eb49b888790bc8d5db5e2543ecd25f0d9..cfddda02128d3ac7da92ca5ad90d41dc876bbf9d 100644 (file)
@@ -814,11 +814,6 @@ struct si_shader_part {
 };
 
 /* si_shader.c */
-struct si_shader *
-si_generate_gs_copy_shader(struct si_screen *sscreen,
-                          struct ac_llvm_compiler *compiler,
-                          struct si_shader_selector *gs_selector,
-                          struct pipe_debug_callback *debug);
 int si_compile_shader(struct si_screen *sscreen,
                      struct ac_llvm_compiler *compiler,
                      struct si_shader *shader,
@@ -844,6 +839,13 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
 const char *si_get_shader_name(const struct si_shader *shader);
 void si_shader_binary_clean(struct si_shader_binary *binary);
 
+/* si_shader_llvm_gs.c */
+struct si_shader *
+si_generate_gs_copy_shader(struct si_screen *sscreen,
+                          struct ac_llvm_compiler *compiler,
+                          struct si_shader_selector *gs_selector,
+                          struct pipe_debug_callback *debug);
+
 /* si_shader_nir.c */
 void si_nir_scan_shader(const struct nir_shader *nir,
                        struct si_shader_info *info);
index dd3dafe77bfbf6bd59e969703f69970ed87ae754..8279a6826b295b81512a5eaf4b14bcb5f7685092 100644 (file)
@@ -260,6 +260,7 @@ LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx,
                                   LLVMTypeRef type, LLVMValueRef val1,
                                   LLVMValueRef val2);
 void si_llvm_emit_barrier(struct si_shader_context *ctx);
+void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
 void si_declare_compute_memory(struct si_shader_context *ctx);
 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
                                 unsigned swizzle);
@@ -287,8 +288,6 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
                             struct ac_arg param, unsigned rshift,
                             unsigned bitwidth);
-LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
-LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                               unsigned num_parts, unsigned main_part,
                               unsigned next_shader_first_part);
@@ -304,6 +303,21 @@ LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueR
                                       struct ac_arg param, unsigned return_index);
 LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index);
+int si_compile_llvm(struct si_screen *sscreen,
+                   struct si_shader_binary *binary,
+                   struct ac_shader_config *conf,
+                   struct ac_llvm_compiler *compiler,
+                   LLVMModuleRef mod,
+                   struct pipe_debug_callback *debug,
+                   enum pipe_shader_type shader_type,
+                   unsigned wave_size,
+                   const char *name,
+                   bool less_optimized);
+void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
+void si_llvm_emit_streamout(struct si_shader_context *ctx,
+                           struct si_shader_output_values *outputs,
+                           unsigned noutput, unsigned stream);
+void si_create_function(struct si_shader_context *ctx);
 
 void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
                             unsigned max_outputs,
@@ -315,6 +329,17 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
 void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);
 
+/* si_shader_llvm_gs.c */
+LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
+LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                             LLVMValueRef *addrs);
+void si_preload_esgs_ring(struct si_shader_context *ctx);
+void si_preload_gs_rings(struct si_shader_context *ctx);
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
+                            union si_shader_part_key *key);
+void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);
+
 /* si_shader_llvm_tess.c */
 void si_llvm_preload_tes_rings(struct si_shader_context *ctx);
 void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
index 50b02abb45d8694a3869e057d430defb04466fd3..8d0a841973ca014c901c83dfe309d36c07d4d3b6 100644 (file)
@@ -242,3 +242,23 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx)
 
        ac_build_s_barrier(&ctx->ac);
 }
+
+/* Ensure that the esgs ring is declared.
+ *
+ * We declare it with 64KB alignment as a hint that the
+ * pointer value will always be 0.
+ *
+ * The call is a no-op when ctx->esgs_ring is already set. Otherwise a
+ * global named "esgs_ring" is added to ctx->ac.module as a zero-length
+ * array of i32 in AC_ADDR_SPACE_LDS with external linkage, and the
+ * resulting value is cached in ctx->esgs_ring.
+ */
+void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
+{
+       if (ctx->esgs_ring)
+               return;
+
+       assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); /* nobody else may have added it */
+
+       ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
+               ctx->ac.module, LLVMArrayType(ctx->i32, 0),
+               "esgs_ring",
+               AC_ADDR_SPACE_LDS);
+       LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
+       LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
+}
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
new file mode 100644 (file)
index 0000000..69dc708
--- /dev/null
@@ -0,0 +1,780 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_shader_internal.h"
+#include "si_pipe.h"
+#include "sid.h"
+#include "util/u_memory.h"
+
+LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
+{
+       /* Return true if the current thread should execute an ES thread.
+        *
+        * The result is an LLVM value: an unsigned "less than" compare of
+        * the thread id against the field unpacked from merged_wave_info
+        * with shift 0 and width 8 (presumably the ES thread count of the
+        * wave -- confirm against the merged shader SGPR layout).
+        */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                            ac_get_thread_id(&ctx->ac),
+                            si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
+}
+
+LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
+{
+       /* Return true if the current thread should execute a GS thread.
+        *
+        * The result is an LLVM value: an unsigned "less than" compare of
+        * the thread id against the field unpacked from merged_wave_info
+        * with shift 8 and width 8 (presumably the GS thread count of the
+        * wave -- confirm against the merged shader SGPR layout).
+        */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                            ac_get_thread_id(&ctx->ac),
+                            si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
+}
+
+static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
+                                         unsigned input_index,
+                                         unsigned vtx_offset_param,
+                                         LLVMTypeRef type,
+                                         unsigned swizzle)
+{      /* Load a GS input from the ESGS ring and bitcast it to "type".
+        *
+        * input_index selects the semantic name/index pair in the selector's
+        * info, vtx_offset_param selects which per-vertex offset to use and
+        * swizzle selects the channel. On the non-GFX9 path, swizzle == ~0
+        * loads channels 0..3 and returns them gathered into one value; the
+        * GFX9 path returns before that check is reached.
+        */
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+       struct si_shader *shader = ctx->shader;
+       LLVMValueRef vtx_offset, soffset;
+       struct si_shader_info *info = &shader->selector->info;
+       unsigned semantic_name = info->input_semantic_name[input_index];
+       unsigned semantic_index = info->input_semantic_index[input_index];
+       unsigned param;
+       LLVMValueRef value;
+
+       param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
+
+       /* GFX9 has the ESGS ring in LDS.
+        *
+        * Each of the three offset arguments is unpacked as two 16-bit
+        * fields: index / 2 picks the argument and index % 2 picks the low
+        * (shift 0) or high (shift 16) half.
+        */
+       if (ctx->screen->info.chip_class >= GFX9) {
+               unsigned index = vtx_offset_param;
+
+               switch (index / 2) {
+               case 0:
+                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
+                                                    index % 2 ? 16 : 0, 16);
+                       break;
+               case 1:
+                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
+                                                    index % 2 ? 16 : 0, 16);
+                       break;
+               case 2:
+                       vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
+                                                    index % 2 ? 16 : 0, 16);
+                       break;
+               default:
+                       assert(0);
+                       return NULL;
+               }
+
+               unsigned offset = param * 4 + swizzle; /* four slots per param, one per channel */
+               vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
+                                         LLVMConstInt(ctx->i32, offset, false), "");
+
+               LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
+               LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); /* shadows the function-scope "value" */
+               if (ac_get_type_size(type) == 64) { /* NOTE(review): confirm the unit ac_get_type_size() returns; 64 only matches a 64-bit type if it is bits */
+                       ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
+                                          &ctx->ac.i32_1, 1, "");
+                       LLVMValueRef values[2] = {
+                               value,
+                               LLVMBuildLoad(ctx->ac.builder, ptr, "")
+                       };
+                       value = ac_build_gather_values(&ctx->ac, values, 2);
+               }
+               return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+       }
+
+       /* GFX6: input load from the ESGS ring in memory.
+        *
+        * Everything below runs whenever chip_class < GFX9, because the
+        * branch above always returns.
+        */
+       if (swizzle == ~0) {
+               LLVMValueRef values[4];
+               unsigned chan;
+               for (chan = 0; chan < 4; chan++) {
+                       values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
+                                                            type, chan);
+               }
+               return ac_build_gather_values(&ctx->ac, values, 4);
+       }
+
+       /* Get the vertex offset parameter on GFX6. */
+       LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
+                                               ctx->gs_vtx_offset[vtx_offset_param]);
+
+       vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
+                                 LLVMConstInt(ctx->i32, 4, 0), "");
+
+       soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
+
+       value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
+                                    vtx_offset, soffset, 0, ac_glc, true, false);
+       if (ac_get_type_size(type) == 64) { /* NOTE(review): same size comparison as in the GFX9 path above */
+               LLVMValueRef value2;
+               soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); /* next channel slot */
+
+               value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
+                                             ctx->i32_0, vtx_offset, soffset,
+                                             0, ac_glc, true, false);
+               return si_build_gather_64bit(ctx, type, value, value2);
+       }
+       return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
+}
+
+static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
+                                        unsigned location,
+                                        unsigned driver_location,
+                                        unsigned component,
+                                        unsigned num_components,
+                                        unsigned vertex_index,
+                                        unsigned const_index,
+                                        LLVMTypeRef type)
+{      /* ABI load_inputs callback for geometry shaders (installed by
+        * si_llvm_init_gs_callbacks, presumably -- that function is not
+        * visible here).
+        *
+        * Loads num_components channels starting at "component" through
+        * si_llvm_load_input_gs() and returns them gathered by
+        * ac_build_varying_gather_values(). The input index passed down is
+        * driver_location / 4 + const_index and vertex_index is used as the
+        * vertex offset parameter. "location" is not used.
+        */
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+       LLVMValueRef value[4]; /* assumes component + num_components <= 4 -- TODO confirm */
+       for (unsigned i = 0; i < num_components; i++) {
+               unsigned offset = i;
+               if (ac_get_type_size(type) == 64) /* NOTE(review): confirm the unit ac_get_type_size() returns */
+                       offset *= 2;
+
+               offset += component;
+               value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4 + const_index,
+                                                            vertex_index, type, offset);
+       }
+
+       return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
+}
+
+/* Pass GS inputs from ES to GS on GFX9.
+ *
+ * Starting from ctx->return_value, the function inserts the listed shader
+ * arguments at fixed return-value slots (0, 1, 2, 3, 5 and the user slots
+ * addressed as 8 + SI_SGPR_*), then five values as floats starting at the
+ * first slot after the user SGPRs, and stores the result back into
+ * ctx->return_value.
+ */
+static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
+{
+       LLVMValueRef ret = ctx->return_value;
+
+       ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
+       ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
+       if (ctx->shader->key.as_ngg) /* slot 2 carries a different argument with and without NGG */
+               ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
+       else
+               ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
+       ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
+       ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
+
+       ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
+                                 8 + SI_SGPR_RW_BUFFERS);
+       ret = si_insert_input_ptr(ctx, ret,
+                                 ctx->bindless_samplers_and_images,
+                                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
+       if (ctx->screen->use_ngg) {
+               ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
+                                         8 + SI_SGPR_VS_STATE_BITS);
+       }
+
+       unsigned vgpr; /* first return slot after the user SGPRs; depends on the ES stage type */
+       if (ctx->type == PIPE_SHADER_VERTEX)
+               vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
+       else
+               vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
+
+       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
+       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
+       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
+       ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
+       ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
+       ctx->return_value = ret;
+}
+
+void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+                             LLVMValueRef *addrs)
+{      /* Store the ES outputs to the ESGS ring.
+        *
+        * addrs[4 * i + chan] is loaded for every output i and every channel
+        * enabled in its usage mask; viewport-index and layer outputs are
+        * skipped. On GFX9+ the values go to ctx->esgs_ring through an
+        * indexed store and the ES return value is then set up for the GS;
+        * on older chips they are written with buffer stores.
+        * max_outputs is not used.
+        */
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+       struct si_shader *es = ctx->shader;
+       struct si_shader_info *info = &es->selector->info;
+       LLVMValueRef lds_base = NULL;
+       unsigned chan;
+       int i;
+
+       if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
+               /* lds_base = (thread_id | wave_idx * wave_size) * itemsize_dw,
+                * where wave_idx is the 4-bit field at bit 24 of
+                * merged_wave_info and itemsize_dw is esgs_itemsize / 4.
+                */
+               unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
+               LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+               LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
+               vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
+                                        LLVMBuildMul(ctx->ac.builder, wave_idx,
+                                                     LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), "");
+               lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
+                                       LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
+       }
+
+       for (i = 0; i < info->num_outputs; i++) {
+               int param;
+
+               if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
+                   info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
+                       continue;
+
+               param = si_shader_io_get_unique_index(info->output_semantic_name[i],
+                                                     info->output_semantic_index[i], false);
+
+               for (chan = 0; chan < 4; chan++) {
+                       if (!(info->output_usagemask[i] & (1 << chan)))
+                               continue;
+
+                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+                       out_val = ac_to_integer(&ctx->ac, out_val);
+
+                       /* GFX9 has the ESGS ring in LDS.
+                        *
+                        * The element index is lds_base + param * 4 + chan.
+                        */
+                       if (ctx->screen->info.chip_class >= GFX9) {
+                               LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false);
+                               idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
+                               ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
+                               continue;
+                       }
+
+                       ac_build_buffer_store_dword(&ctx->ac,
+                                                   ctx->esgs_ring,
+                                                   out_val, 1, NULL,
+                                                   ac_get_arg(&ctx->ac, ctx->es2gs_offset),
+                                                   (4 * param + chan) * 4, /* constant offset: 4 bytes per channel slot */
+                                                   ac_glc | ac_slc | ac_swizzled);
+               }
+       }
+
+       if (ctx->screen->info.chip_class >= GFX9)
+               si_set_es_return_value_for_gs(ctx);
+}
+
+static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
+{      /* Return the GS wave id value: on GFX9+ the 8-bit field at bit 16
+        * of merged_wave_info, otherwise the gs_wave_id shader argument.
+        */
+       if (ctx->screen->info.chip_class >= GFX9)
+               return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
+       else
+               return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
+}
+
+static void emit_gs_epilogue(struct si_shader_context *ctx)
+{      /* Emit the end-of-shader sequence for a geometry shader.
+        *
+        * NGG shaders are handed to gfx10_ngg_gs_emit_epilogue() and nothing
+        * else is emitted. Otherwise a GS_OP_NOP | GS_DONE message is sent
+        * with the GS wave id, preceded by a release fence on GFX10+.
+        */
+       if (ctx->shader->key.as_ngg) {
+               gfx10_ngg_gs_emit_epilogue(ctx);
+               return;
+       }
+
+       if (ctx->screen->info.chip_class >= GFX10)
+               LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
+
+       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
+                        si_get_gs_wave_id(ctx));
+
+       if (ctx->screen->info.chip_class >= GFX9)
+               ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); /* presumably closes an "if" opened by the merged-shader setup -- confirm where the label is set */
+}
+
+static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
+                                    unsigned max_outputs,
+                                    LLVMValueRef *addrs)
+{      /* ABI-shaped wrapper around emit_gs_epilogue(). addrs is not used;
+        * max_outputs is only checked by the assert below.
+        */
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+       struct si_shader_info UNUSED *info = &ctx->shader->selector->info; /* only read by the assert */
+
+       assert(info->num_outputs <= max_outputs);
+
+       emit_gs_epilogue(ctx);
+}
+
+/* Emit one vertex from the geometry shader
+ *
+ * NGG shaders are forwarded to gfx10_ngg_gs_emit_vertex(). Otherwise every
+ * enabled output channel that belongs to "stream" is loaded from addrs[] and
+ * stored to ctx->gsvs_ring[stream], the per-stream vertex counter in
+ * ctx->gs_next_vertex[stream] is incremented, and a GS_OP_EMIT message is
+ * sent if at least one channel was stored.
+ */
+static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
+                               unsigned stream,
+                               LLVMValueRef *addrs)
+{
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+       if (ctx->shader->key.as_ngg) {
+               gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
+               return;
+       }
+
+       struct si_shader_info *info = &ctx->shader->selector->info;
+       struct si_shader *shader = ctx->shader;
+       LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
+       LLVMValueRef gs_next_vertex;
+       LLVMValueRef can_emit;
+       unsigned chan, offset;
+       int i;
+
+       /* Write vertex attribute values to GSVS ring */
+       gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
+                                      ctx->gs_next_vertex[stream],
+                                      "");
+
+       /* If this thread has already emitted the declared maximum number of
+        * vertices, skip the write: excessive vertex emissions are not
+        * supposed to have any effect.
+        *
+        * If the shader has no writes to memory, kill it instead. This skips
+        * further memory loads and may allow LLVM to skip to the end
+        * altogether.
+        */
+       can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
+                                LLVMConstInt(ctx->i32,
+                                             shader->selector->gs_max_out_vertices, 0), "");
+
+       bool use_kill = !info->writes_memory;
+       if (use_kill) {
+               ac_build_kill_if_false(&ctx->ac, can_emit);
+       } else {
+               ac_build_ifcc(&ctx->ac, can_emit, 6505); /* 6505 is the label id matched by the ac_build_endif() at the end */
+       }
+
+       offset = 0; /* counts the channels written for this stream */
+       for (i = 0; i < info->num_outputs; i++) {
+               for (chan = 0; chan < 4; chan++) {
+                       if (!(info->output_usagemask[i] & (1 << chan)) ||
+                           ((info->output_streams[i] >> (2 * chan)) & 3) != stream) /* two stream bits per channel */
+                               continue;
+
+                       LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+                       LLVMValueRef voffset =
+                               LLVMConstInt(ctx->i32, offset *
+                                            shader->selector->gs_max_out_vertices, 0);
+                       offset++;
+
+                       /* voffset = (channel slot * gs_max_out_vertices + gs_next_vertex) * 4 */
+                       voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+                       voffset = LLVMBuildMul(ctx->ac.builder, voffset,
+                                              LLVMConstInt(ctx->i32, 4, 0), "");
+
+                       out_val = ac_to_integer(&ctx->ac, out_val);
+
+                       ac_build_buffer_store_dword(&ctx->ac,
+                                                   ctx->gsvs_ring[stream],
+                                                   out_val, 1,
+                                                   voffset, soffset, 0,
+                                                   ac_glc | ac_slc | ac_swizzled);
+               }
+       }
+
+       gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
+       LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
+
+       /* Signal vertex emission if vertex data was written. */
+       if (offset) {
+               ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+                                si_get_gs_wave_id(ctx));
+       }
+
+       if (!use_kill)
+               ac_build_endif(&ctx->ac, 6505);
+}
+
+/* End the current primitive on the given geometry shader stream. */
+static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
+                                  unsigned stream)
+{
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+       if (!ctx->shader->key.as_ngg) {
+               /* Legacy path: signal the primitive cut with a message. */
+               ac_build_sendmsg(&ctx->ac,
+                                AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
+                                si_get_gs_wave_id(ctx));
+               return;
+       }
+
+       /* NGG: reset the vertex count of the current primitive to zero. */
+       LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
+}
+
+/* Set up ctx->esgs_ring for the current shader. */
+void si_preload_esgs_ring(struct si_shader_context *ctx)
+{
+       if (ctx->screen->info.chip_class > GFX8) {
+               /* Chips newer than GFX8 access the ESGS ring through LDS. */
+               if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
+                       /* Declare the ESGS ring as an explicit LDS symbol. */
+                       si_llvm_declare_esgs_ring(ctx);
+               } else {
+                       ac_declare_lds_as_pointer(&ctx->ac);
+                       ctx->esgs_ring = ctx->ac.lds;
+               }
+               return;
+       }
+
+       /* GFX8 and older: load the ring descriptor from the RW buffer list.
+        * The slot depends on whether this is the GS or the ES side.
+        */
+       unsigned slot = SI_ES_RING_ESGS;
+       if (ctx->type == PIPE_SHADER_GEOMETRY)
+               slot = SI_GS_RING_ESGS;
+
+       LLVMValueRef slot_index = LLVMConstInt(ctx->i32, slot, 0);
+       LLVMValueRef rw_buffers = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+
+       ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, rw_buffers, slot_index);
+}
+
+/* Build one GSVS ring buffer descriptor per vertex stream that has output
+ * components and store it in ctx->gsvs_ring[stream].
+ */
+void si_preload_gs_rings(struct si_shader_context *ctx)
+{
+       const struct si_shader_selector *sel = ctx->shader->selector;
+       LLVMBuilderRef builder = ctx->ac.builder;
+       LLVMValueRef ring_slot = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
+       LLVMValueRef rw_buffers = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+       LLVMValueRef base_desc = ac_build_load_to_sgpr(&ctx->ac, rw_buffers, ring_slot);
+       LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
+
+       /* The conceptual layout of the GSVS ring is
+        *   v0c0 .. vLc0 v0c1 .. vLc1 ..
+        * but the real memory layout is swizzled across
+        * threads:
+        *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
+        *   t16v0c0 ..
+        * Override the buffer descriptor accordingly.
+        */
+
+       /* The last descriptor dword does not depend on the stream, so it is
+        * computed once up front.
+        */
+       uint32_t rsrc3 =
+                       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                       S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
+                       S_008F0C_ADD_TID_ENABLE(1);
+
+       if (ctx->ac.chip_class < GFX10) {
+               rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
+                        S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
+       } else {
+               rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
+                        S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
+                        S_008F0C_RESOURCE_LEVEL(1);
+       }
+
+       /* Running byte offset of the current stream from the ring base. */
+       uint64_t ring_offset = 0;
+
+       for (unsigned stream = 0; stream < 4; ++stream) {
+               const unsigned num_components =
+                       sel->info.num_stream_output_components[stream];
+               if (!num_components)
+                       continue;
+
+               const unsigned stride = 4 * num_components * sel->gs_max_out_vertices;
+
+               /* Limit on the stride field for <= GFX7. */
+               assert(stride < (1 << 14));
+
+               /* Offset the 64-bit value in the first two dwords. */
+               LLVMValueRef desc = LLVMBuildBitCast(builder, base_desc, v2i64, "");
+               LLVMValueRef addr = LLVMBuildExtractElement(builder, desc, ctx->i32_0, "");
+               addr = LLVMBuildAdd(builder, addr,
+                                   LLVMConstInt(ctx->i64, ring_offset, 0), "");
+               ring_offset += stride * ctx->ac.wave_size;
+               desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
+
+               /* Dword 1: OR in the stride and enable swizzling. */
+               desc = LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
+               LLVMValueRef dword1 = LLVMBuildExtractElement(builder, desc, ctx->i32_1, "");
+               dword1 = LLVMBuildOr(builder, dword1,
+                       LLVMConstInt(ctx->i32,
+                                    S_008F04_STRIDE(stride) |
+                                    S_008F04_SWIZZLE_ENABLE(1), 0), "");
+               desc = LLVMBuildInsertElement(builder, desc, dword1, ctx->i32_1, "");
+
+               /* Dword 2: the number of records is the wave size. */
+               desc = LLVMBuildInsertElement(builder, desc,
+                               LLVMConstInt(ctx->i32, ctx->ac.wave_size, 0),
+                               LLVMConstInt(ctx->i32, 2, 0), "");
+
+               /* Dword 3: the precomputed format/swizzle bits. */
+               desc = LLVMBuildInsertElement(builder, desc,
+                       LLVMConstInt(ctx->i32, rsrc3, false),
+                       LLVMConstInt(ctx->i32, 3, 0), "");
+
+               ctx->gsvs_ring[stream] = desc;
+       }
+}
+
+/**
+ * Generate code for the hardware VS shader stage to go with a geometry shader.
+ *
+ * The generated function picks a vertex stream, loads that stream's GS
+ * outputs back from the GSVS ring, runs streamout for it (only when streamout
+ * is not done by NGG and the GS declares streamout outputs) and exports the
+ * outputs of stream 0 via si_llvm_export_vs(). It is then compiled and, if it
+ * needs no scratch, uploaded.
+ *
+ * \param sscreen      screen used for context init, compile, dump and upload
+ * \param compiler     LLVM compiler handed to si_llvm_context_init()
+ * \param gs_selector  selector of the geometry shader this copy shader serves;
+ *                     stored in the returned shader's "selector" field
+ * \param debug        debug callback forwarded to compile and dump
+ * \return the newly allocated shader, or NULL if the allocation failed or
+ *         compiling/uploading did not succeed
+ */
+struct si_shader *
+si_generate_gs_copy_shader(struct si_screen *sscreen,
+                          struct ac_llvm_compiler *compiler,
+                          struct si_shader_selector *gs_selector,
+                          struct pipe_debug_callback *debug)
+{
+       struct si_shader_context ctx;
+       struct si_shader *shader;
+       LLVMBuilderRef builder;
+       struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
+       struct si_shader_info *gsinfo = &gs_selector->info;
+       int i;
+
+
+       shader = CALLOC_STRUCT(si_shader);
+       if (!shader)
+               return NULL;
+
+       /* We can leave the fence as permanently signaled because the GS copy
+        * shader only becomes visible globally after it has been compiled. */
+       util_queue_fence_init(&shader->ready);
+
+       shader->selector = gs_selector;
+       shader->is_gs_copy_shader = true;
+
+       si_llvm_context_init(&ctx, sscreen, compiler,
+                            si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false));
+       ctx.shader = shader;
+       ctx.type = PIPE_SHADER_VERTEX;
+
+       builder = ctx.ac.builder;
+
+       si_create_function(&ctx);
+
+       LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
+       ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr,
+                                                LLVMConstInt(ctx.i32, SI_RING_GSVS, 0));
+
+       LLVMValueRef voffset =
+               LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
+                            LLVMConstInt(ctx.i32, 4, 0), ""); /* vertex_id * 4; presumably 4 bytes per dword - TODO confirm */
+
+       /* Fetch the vertex stream ID.
+        *
+        * It is only unpacked from streamout_config when streamout is not
+        * handled by NGG and the GS has streamout outputs; in every other
+        * case the stream is the constant 0.
+        */
+       LLVMValueRef stream_id;
+
+       if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
+               stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
+       else
+               stream_id = ctx.i32_0;
+
+       /* Fill in output information. */
+       for (i = 0; i < gsinfo->num_outputs; ++i) {
+               outputs[i].semantic_name = gsinfo->output_semantic_name[i];
+               outputs[i].semantic_index = gsinfo->output_semantic_index[i];
+
+               for (int chan = 0; chan < 4; chan++) {
+                       outputs[i].vertex_stream[chan] =
+                               (gsinfo->output_streams[i] >> (2 * chan)) & 3; /* 2 bits of stream index per channel */
+               }
+       }
+
+       LLVMBasicBlockRef end_bb;
+       LLVMValueRef switch_inst;
+
+       end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
+       switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); /* default target is "end"; one case per handled stream is added below */
+
+       for (int stream = 0; stream < 4; stream++) {
+               LLVMBasicBlockRef bb;
+               unsigned offset;
+
+               if (!gsinfo->num_stream_output_components[stream])
+                       continue;
+
+               if (stream > 0 && !gs_selector->so.num_outputs)
+                       continue; /* streams other than 0 are only handled when the GS has streamout outputs */
+
+               bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); /* new block is placed before "end" */
+               LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
+               LLVMPositionBuilderAtEnd(builder, bb);
+
+               /* Fetch vertex data from GSVS ring.
+                *
+                * "offset" counts the enabled channels of this stream seen so
+                * far; each one is loaded from its own scalar offset. Channels
+                * that are unused or belong to another stream become undef.
+                *
+                * NOTE(review): the "16 * 4" factor in soffset presumably is
+                * 16 threads times 4 bytes, matching the swizzled ring layout
+                * - confirm against the ring descriptor setup.
+                */
+               offset = 0;
+               for (i = 0; i < gsinfo->num_outputs; ++i) {
+                       for (unsigned chan = 0; chan < 4; chan++) {
+                               if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
+                                   outputs[i].vertex_stream[chan] != stream) {
+                                       outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
+                                       continue;
+                               }
+
+                               LLVMValueRef soffset = LLVMConstInt(ctx.i32,
+                                       offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
+                               offset++;
+
+                               outputs[i].values[chan] =
+                                       ac_build_buffer_load(&ctx.ac,
+                                                            ctx.gsvs_ring[0], 1,
+                                                            ctx.i32_0, voffset,
+                                                            soffset, 0, ac_glc | ac_slc,
+                                                            true, false);
+                       }
+               }
+
+               /* Streamout and exports.
+                *
+                * Streamout is emitted for every handled stream, but only
+                * stream 0 is exported.
+                */
+               if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
+                       si_llvm_emit_streamout(&ctx, outputs,
+                                              gsinfo->num_outputs,
+                                              stream);
+               }
+
+               if (stream == 0)
+                       si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
+
+               LLVMBuildBr(builder, end_bb);
+       }
+
+       LLVMPositionBuilderAtEnd(builder, end_bb);
+
+       LLVMBuildRetVoid(ctx.ac.builder);
+
+       ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
+       si_llvm_optimize_module(&ctx);
+
+       bool ok = false;
+       if (si_compile_llvm(sscreen, &ctx.shader->binary,
+                           &ctx.shader->config, ctx.compiler,
+                           ctx.ac.module,
+                           debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size,
+                           "GS Copy Shader", false) == 0) {
+               if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
+                       fprintf(stderr, "GS Copy Shader:\n");
+               si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
+
+               if (!ctx.shader->config.scratch_bytes_per_wave)
+                       ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
+               else
+                       ok = true; /* NOTE(review): no upload here when scratch is used; presumably done later - confirm */
+       }
+
+       si_llvm_dispose(&ctx);
+
+       if (!ok) { /* NOTE(review): only the struct is freed; confirm shader->binary needs no separate cleanup */
+               FREE(shader);
+               shader = NULL;
+       } else {
+               si_fix_resource_usage(sscreen, shader);
+       }
+       return shader;
+}
+
+/**
+ * Build the GS prolog function.
+ *
+ * The prolog forwards every input register to the return slot with the same
+ * index. When the tri_strip_adj_fix state is set, it additionally rotates the
+ * six input vertices of every other primitive (triangle strips with
+ * adjacency), selected by the low bit of the primitive ID.
+ */
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
+                            union si_shader_part_key *key)
+{
+       const bool is_gfx9_plus = ctx->screen->info.chip_class >= GFX9;
+       LLVMBuilderRef builder = ctx->ac.builder;
+       LLVMTypeRef ret_types[AC_MAX_ARGS];
+       unsigned num_sgprs, num_vgprs;
+
+       memset(&ctx->args, 0, sizeof(ctx->args));
+
+       if (!is_gfx9_plus) {
+               num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+               num_vgprs = 8;
+       } else {
+               if (key->gs_prolog.states.gfx9_prev_is_vs)
+                       num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
+               else
+                       num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
+               num_vgprs = 5; /* ES inputs are not needed by GS */
+       }
+
+       const unsigned num_regs = num_sgprs + num_vgprs;
+
+       /* Declare all SGPRs first, then all VGPRs. SGPRs are returned as
+        * i32, VGPRs as f32.
+        */
+       for (unsigned reg = 0; reg < num_regs; ++reg) {
+               if (reg < num_sgprs) {
+                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
+                       ret_types[reg] = ctx->i32;
+               } else {
+                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+                       ret_types[reg] = ctx->f32;
+               }
+       }
+
+       /* Create the function. */
+       si_llvm_create_func(ctx, "gs_prolog", ret_types, num_regs, 0);
+       LLVMValueRef func = ctx->main_fn;
+
+       /* Set the full EXEC mask for the prolog, because we are only fiddling
+        * with registers here. The main shader part will set the correct EXEC
+        * mask.
+        */
+       if (is_gfx9_plus && !key->gs_prolog.is_monolithic)
+               ac_init_exec_full_mask(&ctx->ac);
+
+       /* Copy inputs to outputs. This should be no-op, as the registers match,
+        * but it will prevent the compiler from overwriting them unintentionally.
+        */
+       LLVMValueRef ret = ctx->return_value;
+       for (unsigned reg = 0; reg < num_regs; reg++) {
+               LLVMValueRef param = LLVMGetParam(func, reg);
+
+               if (reg >= num_sgprs)
+                       param = ac_to_float(&ctx->ac, param);
+               ret = LLVMBuildInsertValue(builder, ret, param, reg, "");
+       }
+
+       if (!key->gs_prolog.states.tri_strip_adj_fix) {
+               LLVMBuildRet(builder, ret);
+               return;
+       }
+
+       /* Remap the input vertices for every other primitive.
+        *
+        * The tables hold the VGPR indices (relative to the first VGPR) that
+        * carry the vertex values: six separate registers before GFX9, three
+        * registers with two 16-bit halves each on GFX9 and newer.
+        */
+       static const unsigned gfx6_vtx_vgprs[6] = { 0, 1, 3, 4, 5, 6 };
+       static const unsigned gfx9_vtx_vgprs[3] = { 0, 1, 4 };
+       LLVMValueRef vtx_in[6], vtx_out[6];
+
+       if (is_gfx9_plus) {
+               for (unsigned k = 0; k < 3; k++) {
+                       const struct ac_arg packed = {
+                               .used = true,
+                               .arg_index = num_sgprs + gfx9_vtx_vgprs[k],
+                       };
+
+                       vtx_in[k*2] = si_unpack_param(ctx, packed, 0, 16);
+                       vtx_in[k*2+1] = si_unpack_param(ctx, packed, 16, 16);
+               }
+       } else {
+               for (unsigned k = 0; k < 6; k++) {
+                       const struct ac_arg arg = {
+                               .used = true,
+                               .arg_index = num_sgprs + gfx6_vtx_vgprs[k],
+                       };
+
+                       vtx_in[k] = ac_get_arg(&ctx->ac, arg);
+               }
+       }
+
+       /* The low bit of the primitive ID decides whether to rotate. */
+       LLVMValueRef prim_id = LLVMGetParam(func, num_sgprs + 2);
+       LLVMValueRef rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
+
+       for (unsigned k = 0; k < 6; ++k) {
+               LLVMValueRef unrotated = vtx_in[k];
+               LLVMValueRef rotated = vtx_in[(k + 4) % 6];
+
+               vtx_out[k] = LLVMBuildSelect(builder, rotate, rotated, unrotated, "");
+       }
+
+       /* Write the remapped values back into the return value. */
+       if (is_gfx9_plus) {
+               for (unsigned k = 0; k < 3; k++) {
+                       /* Repack the two 16-bit halves into one register. */
+                       LLVMValueRef hi = LLVMBuildShl(builder, vtx_out[k*2+1],
+                                                      LLVMConstInt(ctx->i32, 16, 0), "");
+                       LLVMValueRef packed = LLVMBuildOr(builder, vtx_out[k*2], hi, "");
+
+                       packed = ac_to_float(&ctx->ac, packed);
+                       ret = LLVMBuildInsertValue(builder, ret, packed,
+                                                  num_sgprs + gfx9_vtx_vgprs[k], "");
+               }
+       } else {
+               for (unsigned k = 0; k < 6; k++) {
+                       LLVMValueRef value = ac_to_float(&ctx->ac, vtx_out[k]);
+
+                       ret = LLVMBuildInsertValue(builder, ret, value,
+                                                  num_sgprs + gfx6_vtx_vgprs[k], "");
+               }
+       }
+
+       LLVMBuildRet(builder, ret);
+}
+
+/* Install the geometry shader callbacks into the shader ABI of ctx. */
+void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
+{
+       struct ac_shader_abi *abi = &ctx->abi;
+
+       /* Vertex/primitive emission and the epilogue. */
+       abi->emit_vertex = si_llvm_emit_vertex;
+       abi->emit_primitive = si_llvm_emit_primitive;
+       abi->emit_outputs = si_llvm_emit_gs_epilogue;
+
+       /* Input loading. */
+       abi->load_inputs = si_nir_load_input_gs;
+}