radeonsi: allow generating VS prologs with 0 inputs

[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index 803c3be4e7721cefffc5834a1fbfab7fa0f64a85..0f96b5f608825e18da4055407856229074921312 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1081,9 +1081,22 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
         struct tgsi_shader_info *info = &ctx->shader->selector->info;
         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
         LLVMValueRef dw_addr, stride;
+       ubyte name, index;
  
         driver_location = driver_location / 4;
  
+       if (load_input) {
+               name = info->input_semantic_name[driver_location];
+               index = info->input_semantic_index[driver_location];
+       } else {
+               name = info->output_semantic_name[driver_location];
+               index = info->output_semantic_index[driver_location];
+       }
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
         if (load_input) {
                 stride = get_tcs_in_vertex_dw_stride(ctx);
                 dw_addr = get_tcs_in_current_patch_offset(ctx);
@@ -1101,16 +1114,6 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
         }
  
-       ubyte name;
-       ubyte index;
-       if (load_input) {
-               name = info->input_semantic_name[driver_location];
-               index = info->input_semantic_index[driver_location];
-       } else {
-               name = info->output_semantic_name[driver_location];
-               index = info->output_semantic_index[driver_location];
-       }
-
         dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
                                                       vertex_index, param_index,
                                                       name, index);
@@ -1186,6 +1189,10 @@ LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
         ubyte name = info->input_semantic_name[driver_location];
         ubyte index = info->input_semantic_index[driver_location];
  
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
         base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
  
         if (!param_index) {
@@ -1334,7 +1341,6 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
         struct tgsi_shader_info *info = &ctx->shader->selector->info;
         const unsigned component = var->data.location_frac;
-       const bool is_patch = var->data.patch;
         unsigned driver_location = var->data.driver_location;
         LLVMValueRef dw_addr, stride;
         LLVMValueRef buffer, base, addr;
@@ -1350,6 +1356,14 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
         if (!param_index)
                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
  
+       const bool is_patch = var->data.patch ||
+                             var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+                             var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
         if (!is_patch) {
                 stride = get_tcs_out_vertex_dw_stride(ctx);
                 dw_addr = get_tcs_out_current_patch_offset(ctx);
@@ -1388,7 +1402,7 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
                                                                param_index, name, index);
  
-       for (unsigned chan = 0; chan < 8; chan++) {
+       for (unsigned chan = component; chan < 8; chan++) {
                 if (!(writemask & (1 << chan)))
                         continue;
                 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
@@ -4501,9 +4515,12 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
  
         if (!shader->is_gs_copy_shader) {
                 /* Vertex load indices. */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
-               for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+               if (shader->selector->info.num_inputs) {
+                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+                                  &ctx->vertex_index0);
+                       for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
+                               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+               }
                 *num_prolog_vgprs += shader->selector->info.num_inputs;
         }
  }
@@ -5979,6 +5996,22 @@ static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
         return sel->vs_needs_prolog || key->ls_vgpr_fix;
  }
  
+LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
+{
+       /* ES thread test: thread ID < vertex count unpacked from merged_wave_info (0, 8). */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                            ac_get_thread_id(&ctx->ac),
+                            si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
+}
+
+LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
+{
+       /* GS thread test: thread ID < patch/primitive count unpacked from merged_wave_info (8, 8). */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                            ac_get_thread_id(&ctx->ac),
+                            si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
+}
+
  static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                  struct nir_shader *nir, bool free_nir)
  {
@@ -6146,7 +6179,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
                            ctx->type == PIPE_SHADER_GEOMETRY ||
                            (shader->key.as_ngg && !shader->key.as_es)) {
-                       LLVMValueRef num_threads;
+                       LLVMValueRef thread_enabled;
                         bool nested_barrier;
  
                         if (!shader->is_monolithic ||
@@ -6163,21 +6196,15 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                         nested_barrier = true;
                                 }
  
-                               /* Number of patches / primitives */
-                               num_threads = si_unpack_param(ctx, ctx->merged_wave_info, 8, 8);
+                               thread_enabled = si_is_gs_thread(ctx);
                         } else {
-                               /* Number of vertices */
-                               num_threads = si_unpack_param(ctx, ctx->merged_wave_info, 0, 8);
+                               thread_enabled = si_is_es_thread(ctx);
                                 nested_barrier = false;
                         }
  
-                       LLVMValueRef ena =
-                               LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                                           ac_get_thread_id(&ctx->ac), num_threads, "");
-
                         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
                         ctx->merged_wrap_if_label = 11500;
-                       ac_build_ifcc(&ctx->ac, ena, ctx->merged_wrap_if_label);
+                       ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
  
                         if (nested_barrier) {
                                 /* Execute a barrier before the second shader in
@@ -6247,7 +6274,7 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
         memset(key, 0, sizeof(*key));
         key->vs_prolog.states = *prolog_key;
         key->vs_prolog.num_input_sgprs = num_input_sgprs;
-       key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+       key->vs_prolog.num_inputs = info->num_inputs;
         key->vs_prolog.as_ls = shader_out->key.as_ls;
         key->vs_prolog.as_es = shader_out->key.as_es;
         key->vs_prolog.as_ngg = shader_out->key.as_ngg;
@@ -6915,6 +6942,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                                              shader->info.num_input_sgprs,
                                              &shader->key.part.vs.prolog,
                                              shader, &prolog_key);
+                       prolog_key.vs_prolog.is_monolithic = true;
                         si_build_vs_prolog_function(&ctx, &prolog_key);
                         parts[0] = ctx.main_fn;
                 }
@@ -7309,7 +7337,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
         memset(&ctx->args, 0, sizeof(ctx->args));
  
         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
-       returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
+       returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
                          sizeof(LLVMTypeRef));
         num_returns = 0;
  
@@ -7329,7 +7357,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
         }
  
         /* Vertex load indices. */
-       for (i = 0; i <= key->vs_prolog.last_input; i++)
+       for (i = 0; i < key->vs_prolog.num_inputs; i++)
                 returns[num_returns++] = ctx->f32;
  
         /* Create the function. */
@@ -7404,22 +7432,6 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                            key->vs_prolog.num_input_sgprs + i, "");
         }
  
-       LLVMValueRef original_ret = ret;
-       bool wrapped = false;
-       LLVMBasicBlockRef if_entry_block = NULL;
-
-       if (key->vs_prolog.is_monolithic && key->vs_prolog.as_ngg) {
-               LLVMValueRef num_threads;
-               LLVMValueRef ena;
-
-               num_threads = si_unpack_param(ctx, merged_wave_info, 0, 8);
-               ena = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                                       ac_get_thread_id(&ctx->ac), num_threads, "");
-               if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
-               ac_build_ifcc(&ctx->ac, ena, 11501);
-               wrapped = true;
-       }
-
         /* Compute vertex load indices from instance divisors. */
         LLVMValueRef instance_divisor_constbuf = NULL;
  
@@ -7431,7 +7443,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
         }
  
-       for (i = 0; i <= key->vs_prolog.last_input; i++) {
+       for (i = 0; i < key->vs_prolog.num_inputs; i++) {
                 bool divisor_is_one =
                         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
                 bool divisor_is_fetched =
@@ -7475,20 +7487,6 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                            ctx->args.arg_count + i, "");
         }
  
-       if (wrapped) {
-               LLVMBasicBlockRef bbs[2] = {
-                       LLVMGetInsertBlock(ctx->ac.builder),
-                       if_entry_block,
-               };
-               ac_build_endif(&ctx->ac, 11501);
-
-               LLVMValueRef values[2] = {
-                       ret,
-                       original_ret
-               };
-               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(ret), 2, values, bbs);
-       }
-
         si_llvm_build_ret(ctx, ret);
  }