radeonsi/gfx10: generate VS and TES as NGG merged ESGS shaders

author Nicolai Hähnle <nicolai.haehnle@amd.com>

Thu, 16 Nov 2017 16:00:50 +0000 (17:00 +0100)

committer Marek Olšák <marek.olsak@amd.com>

Wed, 3 Jul 2019 19:51:12 +0000 (15:51 -0400)
author Nicolai Hähnle <nicolai.haehnle@amd.com>
Thu, 16 Nov 2017 16:00:50 +0000 (17:00 +0100)
committer Marek Olšák <marek.olsak@amd.com>
Wed, 3 Jul 2019 19:51:12 +0000 (15:51 -0400)
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources

index ab0ba93b02a6d98f4bcb2c94c31c8c5b880b47d3..83cca397716295b6ff6358685ea0834b1d9b2e2c 100644 (file)
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -6,6 +6,7 @@ C_SOURCES := \
         $(GENERATED_SOURCES) \
         cik_sdma.c \
         driinfo_radeonsi.h \
+       gfx10_shader_ngg.c \
         si_blit.c \
         si_buffer.c \
         si_build_pm4.h \
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c

new file mode 100644 (file)

index 0000000..f5774b2
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+
+#include "sid.h"
+
+#include "util/u_memory.h"
+
+static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
+{
+       return si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
+}
+
+static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
+{
+       return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+                           LLVMConstInt(ctx->ac.i32, 12, false),
+                           LLVMConstInt(ctx->ac.i32, 9, false),
+                           false);
+}
+
+static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
+{
+       return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
+                           LLVMConstInt(ctx->ac.i32, 22, false),
+                           LLVMConstInt(ctx->ac.i32, 9, false),
+                           false);
+}
+
+/* Send GS Alloc Req message from the first wave of the group to SPI.
+ * Message payload is:
+ * - bits 0..10: vertices in group
+ * - bits 12..22: primitives in group
+ */
+static void build_sendmsg_gs_alloc_req(struct si_shader_context *ctx,
+                                      LLVMValueRef vtx_cnt,
+                                      LLVMValueRef prim_cnt)
+{
+       LLVMBuilderRef builder = ctx->ac.builder;
+       LLVMValueRef tmp;
+
+       tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
+       ac_build_ifcc(&ctx->ac, tmp, 5020);
+
+       tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
+       tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
+       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
+
+       ac_build_endif(&ctx->ac, 5020);
+}
+
+struct ngg_prim {
+       unsigned num_vertices;
+       LLVMValueRef isnull;
+       LLVMValueRef index[3];
+       LLVMValueRef edgeflag[3];
+};
+
+static void build_export_prim(struct si_shader_context *ctx,
+                             const struct ngg_prim *prim)
+{
+       LLVMBuilderRef builder = ctx->ac.builder;
+       struct ac_export_args args;
+       LLVMValueRef tmp;
+
+       tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
+       args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");
+
+       for (unsigned i = 0; i < prim->num_vertices; ++i) {
+               tmp = LLVMBuildShl(builder, prim->index[i],
+                                  LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
+               args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+               tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
+               tmp = LLVMBuildShl(builder, tmp,
+                                  LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
+               args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
+       }
+
+       args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
+       args.out[1] = LLVMGetUndef(ctx->ac.f32);
+       args.out[2] = LLVMGetUndef(ctx->ac.f32);
+       args.out[3] = LLVMGetUndef(ctx->ac.f32);
+
+       args.target = V_008DFC_SQ_EXP_PRIM;
+       args.enabled_channels = 1;
+       args.done = true;
+       args.valid_mask = false;
+       args.compr = false;
+
+       ac_build_export(&ctx->ac, &args);
+}
+
+/**
+ * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
+ */
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
+                            unsigned max_outputs,
+                            LLVMValueRef *addrs)
+{
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+       struct tgsi_shader_info *info = &ctx->shader->selector->info;
+       struct si_shader_output_values *outputs = NULL;
+       LLVMBuilderRef builder = ctx->ac.builder;
+       struct lp_build_if_state if_state;
+       LLVMValueRef tmp;
+
+       assert(!ctx->shader->is_gs_copy_shader);
+       assert(info->num_outputs <= max_outputs);
+
+       outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
+
+       for (unsigned i = 0; i < info->num_outputs; i++) {
+               outputs[i].semantic_name = info->output_semantic_name[i];
+               outputs[i].semantic_index = info->output_semantic_index[i];
+
+               /* This is used only by streamout. */
+               for (unsigned j = 0; j < 4; j++) {
+                       outputs[i].values[j] =
+                               LLVMBuildLoad(builder,
+                                             addrs[4 * i + j],
+                                             "");
+                       outputs[i].vertex_stream[j] =
+                               (info->output_streams[i] >> (2 * j)) & 3;
+               }
+       }
+
+       lp_build_endif(&ctx->merged_wrap_if_state);
+
+       LLVMValueRef prims_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+       LLVMValueRef vtx_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
+       LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
+                                                 ac_get_thread_id(&ctx->ac), prims_in_wave, "");
+       LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
+                                                 ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
+       LLVMValueRef vtxindex[] = {
+               si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 0, 16),
+               si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 16, 16),
+               si_unpack_param(ctx, ctx->param_gs_vtx23_offset, 0, 16),
+       };
+
+       /* Determine the number of vertices per primitive. */
+       unsigned num_vertices;
+       LLVMValueRef num_vertices_val;
+
+       if (ctx->type == PIPE_SHADER_VERTEX) {
+               if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) {
+                       /* Blits always use axis-aligned rectangles with 3 vertices. */
+                       num_vertices = 3;
+                       num_vertices_val = LLVMConstInt(ctx->i32, 3, 0);
+               } else {
+                       /* Extract OUTPRIM field. */
+                       tmp = si_unpack_param(ctx, ctx->param_vs_state_bits, 2, 2);
+                       num_vertices_val = LLVMBuildAdd(builder, tmp, ctx->i32_1, "");
+                       num_vertices = 3; /* TODO: optimize for points & lines */
+               }
+       } else {
+               assert(ctx->type == PIPE_SHADER_TESS_EVAL);
+
+               if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
+                       num_vertices = 1;
+               else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
+                       num_vertices = 2;
+               else
+                       num_vertices = 3;
+
+               num_vertices_val = LLVMConstInt(ctx->i32, num_vertices, false);
+       }
+
+       /* TODO: streamout */
+
+       /* TODO: primitive culling */
+
+       build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));
+
+       /* Export primitive data to the index buffer. Format is:
+        *  - bits 0..8: index 0
+        *  - bit 9: edge flag 0
+        *  - bits 10..18: index 1
+        *  - bit 19: edge flag 1
+        *  - bits 20..28: index 2
+        *  - bit 29: edge flag 2
+        *  - bit 31: null primitive (skip)
+        *
+        * For the first version, we will always build up all three indices
+        * independent of the primitive type. The additional garbage data
+        * shouldn't hurt.
+        *
+        * TODO: culling depends on the primitive type, so can have some
+        * interaction here.
+        */
+       lp_build_if(&if_state, &ctx->gallivm, is_gs_thread);
+       {
+               struct ngg_prim prim = {};
+
+               prim.num_vertices = num_vertices;
+               prim.isnull = ctx->ac.i1false;
+               memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);
+
+               for (unsigned i = 0; i < num_vertices; ++i) {
+                       tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
+                                           LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
+                       prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+               }
+
+               build_export_prim(ctx, &prim);
+       }
+       lp_build_endif(&if_state);
+
+       /* Export per-vertex data (positions and parameters). */
+       lp_build_if(&if_state, &ctx->gallivm, is_es_thread);
+       {
+               unsigned i;
+
+               /* Unconditionally (re-)load the values for proper SSA form. */
+               for (i = 0; i < info->num_outputs; i++) {
+                       for (unsigned j = 0; j < 4; j++) {
+                               outputs[i].values[j] =
+                                       LLVMBuildLoad(builder,
+                                               addrs[4 * i + j],
+                                               "");
+                       }
+               }
+
+               /* TODO: Vertex shaders have to get PrimitiveID from GS VGPRs. */
+               if (ctx->type == PIPE_SHADER_TESS_EVAL &&
+                   ctx->shader->key.mono.u.vs_export_prim_id) {
+                       outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+                       outputs[i].semantic_index = 0;
+                       outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
+                       for (unsigned j = 1; j < 4; j++)
+                               outputs[i].values[j] = LLVMGetUndef(ctx->f32);
+
+                       memset(outputs[i].vertex_stream, 0,
+                              sizeof(outputs[i].vertex_stream));
+                       i++;
+               }
+
+               si_llvm_export_vs(ctx, outputs, i);
+       }
+       lp_build_endif(&if_state);
+
+       FREE(outputs);
+}
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build

index d733452300d351a7dc12b76a641b7ec5acb52b79..0ca065f34e0cc4c6b0ee7ad315f0a33675cb2c54 100644 (file)
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -21,6 +21,7 @@
  files_libradeonsi = files(
    'cik_sdma.c',
    'driinfo_radeonsi.h',
+  'gfx10_shader_ngg.c',
    'si_blit.c',
    'si_buffer.c',
    'si_build_pm4.h',
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index b6410c624489da57c461f1733e2bc1c91b522e14..2ab1833579ec782e129e7fb7bcaacb86d23badbd 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4412,6 +4412,10 @@ static void declare_streamout_params(struct si_shader_context *ctx,
  static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
  {
         switch (shader->selector->type) {
+       case PIPE_SHADER_VERTEX:
+       case PIPE_SHADER_TESS_EVAL:
+               return shader->key.as_ngg ? 128 : 0;
+
         case PIPE_SHADER_TESS_CTRL:
                 /* Return this so that LLVM doesn't remove s_barrier
                  * instructions on chips where we use s_barrier. */
@@ -4582,7 +4586,7 @@ static void create_function(struct si_shader_context *ctx)
         if (ctx->screen->info.chip_class >= GFX9) {
                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
-               else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
+               else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
         }
  
@@ -4708,7 +4712,12 @@ static void create_function(struct si_shader_context *ctx)
                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
                 declare_per_stage_desc_pointers(ctx, &fninfo,
                                                 ctx->type == PIPE_SHADER_GEOMETRY);
-               ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+
+               if (ctx->shader->key.as_ngg)
+                       add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->gs_tg_info);
+               else
+                       ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+
                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4716,11 +4725,17 @@ static void create_function(struct si_shader_context *ctx)
                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
  
                 declare_global_desc_pointers(ctx, &fninfo);
-               declare_per_stage_desc_pointers(ctx, &fninfo,
-                                               (ctx->type == PIPE_SHADER_VERTEX ||
-                                                ctx->type == PIPE_SHADER_TESS_EVAL));
+               if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
+                       declare_per_stage_desc_pointers(ctx, &fninfo,
+                                                       (ctx->type == PIPE_SHADER_VERTEX ||
+                                                        ctx->type == PIPE_SHADER_TESS_EVAL));
+               }
+
                 if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_specific_input_sgprs(ctx, &fninfo);
+                       if (vs_blit_property)
+                               declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
+                       else
+                               declare_vs_specific_input_sgprs(ctx, &fninfo);
                 } else {
                         ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                         ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4747,8 +4762,9 @@ static void create_function(struct si_shader_context *ctx)
                         declare_tes_input_vgprs(ctx, &fninfo);
                 }
  
-               if (ctx->type == PIPE_SHADER_VERTEX ||
-                   ctx->type == PIPE_SHADER_TESS_EVAL) {
+               if (ctx->shader->key.as_es &&
+                   (ctx->type == PIPE_SHADER_VERTEX ||
+                    ctx->type == PIPE_SHADER_TESS_EVAL)) {
                         unsigned num_user_sgprs;
  
                         if (ctx->type == PIPE_SHADER_VERTEX)
@@ -5925,6 +5941,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
                 else if (shader->key.opt.vs_as_prim_discard_cs)
                         ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+               else if (shader->key.as_ngg)
+                       ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
                 else
                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
@@ -5948,8 +5966,12 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
                 if (shader->key.as_es)
                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
-               else
-                       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+               else {
+                       if (shader->key.as_ngg)
+                               ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+                       else
+                               ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+               }
                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
                 break;
         case PIPE_SHADER_GEOMETRY:
@@ -5994,6 +6016,10 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
          *
          * For monolithic merged shaders, the first shader is wrapped in an
          * if-block together with its prolog in si_build_wrapper_function.
+        *
+        * NGG vertex and tess eval shaders running as the last
+        * vertex/geometry stage handle execution explicitly using
+        * if-statements.
          */
         if (ctx->screen->info.chip_class >= GFX9) {
                 if (!shader->is_monolithic &&
@@ -6005,28 +6031,50 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx)
                         si_init_exec_from_input(ctx,
                                                 ctx->param_merged_wave_info, 0);
                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
-                          ctx->type == PIPE_SHADER_GEOMETRY) {
+                          ctx->type == PIPE_SHADER_GEOMETRY ||
+                          shader->key.as_ngg) {
+                       LLVMValueRef num_threads;
+                       bool nested_barrier;
+
                         if (!shader->is_monolithic)
                                 ac_init_exec_full_mask(&ctx->ac);
  
-                       LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+                       if (ctx->type == PIPE_SHADER_TESS_CTRL ||
+                           ctx->type == PIPE_SHADER_GEOMETRY) {
+                               /* Number of patches / primitives */
+                               num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+                               nested_barrier = true;
+                       } else {
+                               /* Number of vertices */
+                               num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
+                               nested_barrier = false;
+                       }
+
                         LLVMValueRef ena =
                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
                                             ac_get_thread_id(&ctx->ac), num_threads, "");
                         lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
  
-                       /* The barrier must execute for all shaders in a
-                        * threadgroup.
-                        *
-                        * Execute the barrier inside the conditional block,
-                        * so that empty waves can jump directly to s_endpgm,
-                        * which will also signal the barrier.
-                        *
-                        * If the shader is TCS and the TCS epilog is present
-                        * and contains a barrier, it will wait there and then
-                        * reach s_endpgm.
-                        */
-                       si_llvm_emit_barrier(NULL, bld_base, NULL);
+                       if (nested_barrier) {
+                               /* Execute a barrier before the second shader in
+                                * a merged shader.
+                                *
+                                * Execute the barrier inside the conditional block,
+                                * so that empty waves can jump directly to s_endpgm,
+                                * which will also signal the barrier.
+                                *
+                                * This is possible in gfx9, because an empty wave
+                                * for the second shader does not participate in
+                                * the epilogue. With NGG, empty waves may still
+                                * be required to export data (e.g. GS output vertices),
+                                * so we cannot let them exit early.
+                                *
+                                * If the shader is TCS and the TCS epilog is present
+                                * and contains a barrier, it will wait there and then
+                                * reach s_endpgm.
+                                */
+                               si_llvm_emit_barrier(NULL, bld_base, NULL);
+                       }
                 }
         }
  
@@ -6099,6 +6147,8 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
         } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
                 key->vs_prolog.as_es = 1;
                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
+       } else if (shader_out->key.as_ngg) {
+               key->vs_prolog.num_merged_next_stage_vgprs = 5;
         }
  
         /* Enable loading the InstanceID VGPR. */
@@ -7227,6 +7277,21 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                            key->vs_prolog.num_input_sgprs + i, "");
         }
  
+       struct lp_build_if_state wrap_if_state;
+       LLVMValueRef original_ret = ret;
+       bool wrapped = false;
+
+       if (key->vs_prolog.is_monolithic && key->vs_prolog.as_ngg) {
+               LLVMValueRef num_threads;
+               LLVMValueRef ena;
+
+               num_threads = si_unpack_param(ctx, 3, 0, 8);
+               ena = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                                       ac_get_thread_id(&ctx->ac), num_threads, "");
+               lp_build_if(&wrap_if_state, &ctx->gallivm, ena);
+               wrapped = true;
+       }
+
         /* Compute vertex load indices from instance divisors. */
         LLVMValueRef instance_divisor_constbuf = NULL;
  
@@ -7282,6 +7347,20 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                            fninfo.num_params + i, "");
         }
  
+       if (wrapped) {
+               lp_build_endif(&wrap_if_state);
+
+               LLVMValueRef values[2] = {
+                       ret,
+                       original_ret
+               };
+               LLVMBasicBlockRef bbs[2] = {
+                       wrap_if_state.true_block,
+                       wrap_if_state.entry_block
+               };
+               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(ret), 2, values, bbs);
+       }
+
         si_llvm_build_ret(ctx, ret);
  }
  
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h

index f758e99047d03179d12b890970aeaf499eba02a1..5419a7312b1c60975793e4d4d466f79b396d036f 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -186,6 +186,13 @@ struct si_shader_context {
         int param_tes_rel_patch_id;
         /* HW ES */
         int param_es2gs_offset;
+       /* HW GS */
+       /* On gfx10:
+        *  - bits 0..10: ordered_wave_id
+        *  - bits 12..20: number of vertices in group
+        *  - bits 22..30: number of primitives in group
+        */
+       LLVMValueRef gs_tg_info;
         /* API GS */
         int param_gs2vs_offset;
         int param_gs_wave_id; /* GFX6 */
@@ -372,4 +379,8 @@ LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
                              unsigned param, unsigned rshift,
                              unsigned bitwidth);
  
+void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
+                            unsigned max_outputs,
+                            LLVMValueRef *addrs);
+
  #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c

index 33b40685f04a8c96ad6cd36051c53029867b88f0..a9946d99185368c47debf6e1fc23219b062607a5 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
@@ -1128,7 +1128,7 @@ void si_llvm_create_func(struct si_shader_context *ctx,
         if (ctx->screen->info.chip_class >= GFX9) {
                 if (ctx->shader->key.as_ls)
                         real_shader_type = PIPE_SHADER_TESS_CTRL;
-               else if (ctx->shader->key.as_es)
+               else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg)
                         real_shader_type = PIPE_SHADER_GEOMETRY;
         }
author	Nicolai Hähnle <nicolai.haehnle@amd.com>
	Thu, 16 Nov 2017 16:00:50 +0000 (17:00 +0100)
committer	Marek Olšák <marek.olsak@amd.com>
	Wed, 3 Jul 2019 19:51:12 +0000 (15:51 -0400)
src/gallium/drivers/radeonsi/Makefile.sources		patch \| blob \| history
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c	[new file with mode: 0644]	patch \| blob
src/gallium/drivers/radeonsi/meson.build		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_internal.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c		patch \| blob \| history