radeonsi/gfx9: proper workaround for LS/HS VGPR initialization bug

author Nicolai Hähnle <nicolai.haehnle@amd.com>
Tue, 29 Aug 2017 21:11:38 +0000 (23:11 +0200)
committer Nicolai Hähnle <nicolai.haehnle@amd.com>
Wed, 6 Sep 2017 08:02:49 +0000 (10:02 +0200)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h

index dde0c115dc5556ecfbedf8d143b31a22d8f5927b..8db7028c9a109bb89acfb02243ce2338ccc9a01a 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -389,6 +389,7 @@ struct si_context {
  
         /* Emitted draw state. */
         bool                    gs_tri_strip_adj_fix:1;
+       bool                    ls_vgpr_fix:1;
         int                     last_index_size;
         int                     last_base_vertex;
         int                     last_start_instance;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index 0e89ccac09db902f4d1a55fb72f308a0f6120233..db8297ddc4abf43ae7d61b960e412123f7508dc3 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5444,6 +5444,8 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key,
                 prefix, prolog->instance_divisor_is_one);
         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
                 prefix, prolog->instance_divisor_is_fetched);
+       fprintf(f, "  %s.ls_vgpr_fix = %u\n",
+               prefix, prolog->ls_vgpr_fix);
  
         fprintf(f, "  mono.vs.fix_fetch = {");
         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
@@ -5636,6 +5638,14 @@ static void si_init_exec_from_input(struct si_shader_context *ctx,
                            ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
  }
  
+static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
+                              const struct si_vs_prolog_bits *key)
+{
+       /* VGPR initialization fixup for Vega10 and Raven is always done in the
+        * VS prolog. */
+       return sel->vs_needs_prolog || key->ls_vgpr_fix;
+}
+
  static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                  bool is_monolithic)
  {
@@ -5712,7 +5722,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                     (shader->key.as_es || shader->key.as_ls) &&
                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
                      (ctx->type == PIPE_SHADER_VERTEX &&
-                     !sel->vs_needs_prolog))) {
+                     !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
                         si_init_exec_from_input(ctx,
                                                 ctx->param_merged_wave_info, 0);
                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
@@ -6364,6 +6374,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                 if (sscreen->b.chip_class >= GFX9) {
                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
                         LLVMValueRef parts[4];
+                       bool vs_needs_prolog =
+                               si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
  
                         /* TCS main part */
                         parts[2] = ctx.main_fn;
@@ -6376,7 +6388,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                         parts[3] = ctx.main_fn;
  
                         /* VS prolog */
-                       if (ls->vs_needs_prolog) {
+                       if (vs_needs_prolog) {
                                 union si_shader_part_key vs_prolog_key;
                                 si_get_vs_prolog_key(&ls->info,
                                                      shader->info.num_input_sgprs,
@@ -6407,9 +6419,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                         ctx.type = PIPE_SHADER_TESS_CTRL;
  
                         si_build_wrapper_function(&ctx,
-                                                 parts + !ls->vs_needs_prolog,
-                                                 4 - !ls->vs_needs_prolog, 0,
-                                                 ls->vs_needs_prolog ? 2 : 1);
+                                                 parts + !vs_needs_prolog,
+                                                 4 - !vs_needs_prolog, 0,
+                                                 vs_needs_prolog ? 2 : 1);
                 } else {
                         LLVMValueRef parts[2];
                         union si_shader_part_key epilog_key;
@@ -6746,9 +6758,9 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
         LLVMTypeRef *returns;
         LLVMValueRef ret, func;
         int num_returns, i;
-       unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
-                                key->vs_prolog.num_merged_next_stage_vgprs;
+       unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+       LLVMValueRef input_vgprs[9];
         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
                                       num_input_vgprs;
         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
@@ -6768,13 +6780,10 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  
         /* Preloaded VGPRs (outputs must be floats) */
         for (i = 0; i < num_input_vgprs; i++) {
-               add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
                 returns[num_returns++] = ctx->f32;
         }
  
-       fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
-       fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
-
         /* Vertex load indices. */
         for (i = 0; i <= key->vs_prolog.last_input; i++)
                 returns[num_returns++] = ctx->f32;
@@ -6783,9 +6792,33 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
         si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
         func = ctx->main_fn;
  
-       if (key->vs_prolog.num_merged_next_stage_vgprs &&
-           !key->vs_prolog.is_monolithic)
-               si_init_exec_from_input(ctx, 3, 0);
+       if (key->vs_prolog.num_merged_next_stage_vgprs) {
+               if (!key->vs_prolog.is_monolithic)
+                       si_init_exec_from_input(ctx, 3, 0);
+
+               if (key->vs_prolog.as_ls &&
+                   (ctx->screen->b.family == CHIP_VEGA10 ||
+                    ctx->screen->b.family == CHIP_RAVEN)) {
+                       /* If there are no HS threads, SPI loads the LS VGPRs
+                        * starting at VGPR 0. Shift them back to where they
+                        * belong.
+                        */
+                       LLVMValueRef has_hs_threads =
+                               LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+                                   unpack_param(ctx, 3, 8, 8),
+                                   ctx->i32_0, "");
+
+                       for (i = 4; i > 0; --i) {
+                               input_vgprs[i + 1] =
+                                       LLVMBuildSelect(gallivm->builder, has_hs_threads,
+                                                       input_vgprs[i + 1],
+                                                       input_vgprs[i - 1], "");
+                       }
+               }
+       }
+
+       ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
+       ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
  
         /* Copy inputs to outputs. This should be no-op, as the registers match,
          * but it will prevent the compiler from overwriting them unintentionally.
@@ -6795,10 +6828,11 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                 LLVMValueRef p = LLVMGetParam(func, i);
                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
         }
-       for (; i < fninfo.num_params; i++) {
-               LLVMValueRef p = LLVMGetParam(func, i);
+       for (i = 0; i < num_input_vgprs; i++) {
+               LLVMValueRef p = input_vgprs[i];
                 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
-               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
+                                          key->vs_prolog.num_input_sgprs + i, "");
         }
  
         /* Compute vertex load indices from instance divisors. */
@@ -6859,8 +6893,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
  {
         struct si_shader_selector *vs = main_part->selector;
  
-       /* The prolog is a no-op if there are no inputs. */
-       if (!vs->vs_needs_prolog)
+       if (!si_vs_needs_prolog(vs, key))
                 return true;
  
         /* Get the prolog. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h

index 0c0fa10f40fce4d5519d703e2b22496fa42150e3..ee6b0c167f9e14e5709e034181e309a11ae7730c 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -398,6 +398,7 @@ struct si_vs_prolog_bits {
          */
         uint16_t        instance_divisor_is_one;     /* bitmask of inputs */
         uint16_t        instance_divisor_is_fetched; /* bitmask of inputs */
+       unsigned        ls_vgpr_fix:1;
  };
  
  /* Common TCS bits between the shader key and the epilog key. */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c

index 7ee6cf88e8822906289739306dd3419c3eff21d0..051dfea8f7c6747d191329e5655298957958a3c6 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -195,11 +195,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
          */
         *num_patches = MIN2(*num_patches, 40);
  
-       if (sctx->b.chip_class == SI ||
-           /* TODO: fix GFX9 where a threadgroup contains more than 1 wave and
-            * LS vertices per patch > HS vertices per patch. Piglit: 16in-1out */
-           (sctx->b.chip_class == GFX9 &&
-            num_tcs_input_cp > num_tcs_output_cp)) {
+       if (sctx->b.chip_class == SI) {
                 /* SI bug workaround, related to power management. Limit LS-HS
                  * threadgroups to only one wave.
                  */
@@ -1264,6 +1260,27 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                 sctx->do_update_shaders = true;
         }
  
+       if (sctx->tes_shader.cso &&
+           (sctx->b.family == CHIP_VEGA10 || sctx->b.family == CHIP_RAVEN)) {
+               /* Determine whether the LS VGPR fix should be applied.
+                *
+                * It is only required when num input CPs > num output CPs,
+                * which cannot happen with the fixed function TCS. We should
+                * also update this bit when switching from TCS to fixed
+                * function TCS.
+                */
+               struct si_shader_selector *tcs = sctx->tcs_shader.cso;
+               bool ls_vgpr_fix =
+                       tcs &&
+                       info->vertices_per_patch >
+                       tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+               if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+                       sctx->ls_vgpr_fix = ls_vgpr_fix;
+                       sctx->do_update_shaders = true;
+               }
+       }
+
         if (sctx->gs_shader.cso) {
                 /* Determine whether the GS triangle strip adjacency fix should
                  * be applied. Rotate every other triangle if
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c

index 4f04bbdfaffaff17698c245c87ec67acc0aedf4e..d8791a2a62ef81419f03ea7924c9476eba6903d7 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1283,6 +1283,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
                         si_shader_selector_key_vs(sctx, sctx->vs_shader.cso,
                                                   key, &key->part.tcs.ls_prolog);
                         key->part.tcs.ls = sctx->vs_shader.cso;
+
+                       /* When the LS VGPR fix is needed, monolithic shaders
+                        * can:
+                        *  - avoid initializing EXEC in both the LS prolog
+                        *    and the LS main part when !vs_needs_prolog
+                        *  - remove the fixup for unused input VGPRs
+                        */
+                       key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
+                       key->opt.prefer_mono = sctx->ls_vgpr_fix;
                 }
  
                 key->part.tcs.epilog.prim_mode =
author	Nicolai Hähnle <nicolai.haehnle@amd.com>
	Tue, 29 Aug 2017 21:11:38 +0000 (23:11 +0200)
committer	Nicolai Hähnle <nicolai.haehnle@amd.com>
	Wed, 6 Sep 2017 08:02:49 +0000 (10:02 +0200)
src/gallium/drivers/radeonsi/si_pipe.h		patch | blob | history
src/gallium/drivers/radeonsi/si_shader.c		patch | blob | history
src/gallium/drivers/radeonsi/si_shader.h		patch | blob | history
src/gallium/drivers/radeonsi/si_state_draw.c		patch | blob | history
src/gallium/drivers/radeonsi/si_state_shaders.c		patch | blob | history