radeonsi: allow generating VS prologs with 0 inputs
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 803c3be4e7721cefffc5834a1fbfab7fa0f64a85..0f96b5f608825e18da4055407856229074921312 100644 (file)
@@ -1081,9 +1081,22 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
        LLVMValueRef dw_addr, stride;
+       ubyte name, index;
 
        driver_location = driver_location / 4;
 
+       if (load_input) {
+               name = info->input_semantic_name[driver_location];
+               index = info->input_semantic_index[driver_location];
+       } else {
+               name = info->output_semantic_name[driver_location];
+               index = info->output_semantic_index[driver_location];
+       }
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
        if (load_input) {
                stride = get_tcs_in_vertex_dw_stride(ctx);
                dw_addr = get_tcs_in_current_patch_offset(ctx);
@@ -1101,16 +1114,6 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
                param_index = LLVMConstInt(ctx->i32, const_index, 0);
        }
 
-       ubyte name;
-       ubyte index;
-       if (load_input) {
-               name = info->input_semantic_name[driver_location];
-               index = info->input_semantic_index[driver_location];
-       } else {
-               name = info->output_semantic_name[driver_location];
-               index = info->output_semantic_index[driver_location];
-       }
-
        dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
                                                      vertex_index, param_index,
                                                      name, index);
@@ -1186,6 +1189,10 @@ LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
        ubyte name = info->input_semantic_name[driver_location];
        ubyte index = info->input_semantic_index[driver_location];
 
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
        base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
 
        if (!param_index) {
@@ -1334,7 +1341,6 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
        struct si_shader_context *ctx = si_shader_context_from_abi(abi);
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        const unsigned component = var->data.location_frac;
-       const bool is_patch = var->data.patch;
        unsigned driver_location = var->data.driver_location;
        LLVMValueRef dw_addr, stride;
        LLVMValueRef buffer, base, addr;
@@ -1350,6 +1356,14 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
        if (!param_index)
                param_index = LLVMConstInt(ctx->i32, const_index, 0);
 
+       const bool is_patch = var->data.patch ||
+                             var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+                             var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
        if (!is_patch) {
                stride = get_tcs_out_vertex_dw_stride(ctx);
                dw_addr = get_tcs_out_current_patch_offset(ctx);
@@ -1388,7 +1402,7 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
        addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
                                                               param_index, name, index);
 
-       for (unsigned chan = 0; chan < 8; chan++) {
+       for (unsigned chan = component; chan < 8; chan++) {
                if (!(writemask & (1 << chan)))
                        continue;
                LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
@@ -4501,9 +4515,12 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
 
        if (!shader->is_gs_copy_shader) {
                /* Vertex load indices. */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
-               for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+               if (shader->selector->info.num_inputs) {
+                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+                                  &ctx->vertex_index0);
+                       for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
+                               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+               }
                *num_prolog_vgprs += shader->selector->info.num_inputs;
        }
 }
@@ -5979,6 +5996,22 @@ static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
        return sel->vs_needs_prolog || key->ls_vgpr_fix;
 }
 
+LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
+{
+       /* True if the current thread should execute an ES thread (id < num_es). */
+       LLVMValueRef num_es = si_unpack_param(ctx, ctx->merged_wave_info, 0, 8);
+       LLVMValueRef tid = ac_get_thread_id(&ctx->ac); /* after num_es: fixed IR order */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, num_es, "");
+}
+
+LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
+{
+       /* True if the current thread should execute a GS thread (id < num_gs). */
+       LLVMValueRef num_gs = si_unpack_param(ctx, ctx->merged_wave_info, 8, 8);
+       LLVMValueRef tid = ac_get_thread_id(&ctx->ac); /* after num_gs: fixed IR order */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, num_gs, "");
+}
+
 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                 struct nir_shader *nir, bool free_nir)
 {
@@ -6146,7 +6179,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
                           ctx->type == PIPE_SHADER_GEOMETRY ||
                           (shader->key.as_ngg && !shader->key.as_es)) {
-                       LLVMValueRef num_threads;
+                       LLVMValueRef thread_enabled;
                        bool nested_barrier;
 
                        if (!shader->is_monolithic ||
@@ -6163,21 +6196,15 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                        nested_barrier = true;
                                }
 
-                               /* Number of patches / primitives */
-                               num_threads = si_unpack_param(ctx, ctx->merged_wave_info, 8, 8);
+                               thread_enabled = si_is_gs_thread(ctx);
                        } else {
-                               /* Number of vertices */
-                               num_threads = si_unpack_param(ctx, ctx->merged_wave_info, 0, 8);
+                               thread_enabled = si_is_es_thread(ctx);
                                nested_barrier = false;
                        }
 
-                       LLVMValueRef ena =
-                               LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                                           ac_get_thread_id(&ctx->ac), num_threads, "");
-
                        ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
                        ctx->merged_wrap_if_label = 11500;
-                       ac_build_ifcc(&ctx->ac, ena, ctx->merged_wrap_if_label);
+                       ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
 
                        if (nested_barrier) {
                                /* Execute a barrier before the second shader in
@@ -6247,7 +6274,7 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
        memset(key, 0, sizeof(*key));
        key->vs_prolog.states = *prolog_key;
        key->vs_prolog.num_input_sgprs = num_input_sgprs;
-       key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+       key->vs_prolog.num_inputs = info->num_inputs;
        key->vs_prolog.as_ls = shader_out->key.as_ls;
        key->vs_prolog.as_es = shader_out->key.as_es;
        key->vs_prolog.as_ngg = shader_out->key.as_ngg;
@@ -6915,6 +6942,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                                             shader->info.num_input_sgprs,
                                             &shader->key.part.vs.prolog,
                                             shader, &prolog_key);
+                       prolog_key.vs_prolog.is_monolithic = true;
                        si_build_vs_prolog_function(&ctx, &prolog_key);
                        parts[0] = ctx.main_fn;
                }
@@ -7309,7 +7337,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        memset(&ctx->args, 0, sizeof(ctx->args));
 
        /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
-       returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
+       returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
                         sizeof(LLVMTypeRef));
        num_returns = 0;
 
@@ -7329,7 +7357,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        }
 
        /* Vertex load indices. */
-       for (i = 0; i <= key->vs_prolog.last_input; i++)
+       for (i = 0; i < key->vs_prolog.num_inputs; i++)
                returns[num_returns++] = ctx->f32;
 
        /* Create the function. */
@@ -7404,22 +7432,6 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                           key->vs_prolog.num_input_sgprs + i, "");
        }
 
-       LLVMValueRef original_ret = ret;
-       bool wrapped = false;
-       LLVMBasicBlockRef if_entry_block = NULL;
-
-       if (key->vs_prolog.is_monolithic && key->vs_prolog.as_ngg) {
-               LLVMValueRef num_threads;
-               LLVMValueRef ena;
-
-               num_threads = si_unpack_param(ctx, merged_wave_info, 0, 8);
-               ena = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                                       ac_get_thread_id(&ctx->ac), num_threads, "");
-               if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
-               ac_build_ifcc(&ctx->ac, ena, 11501);
-               wrapped = true;
-       }
-
        /* Compute vertex load indices from instance divisors. */
        LLVMValueRef instance_divisor_constbuf = NULL;
 
@@ -7431,7 +7443,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
        }
 
-       for (i = 0; i <= key->vs_prolog.last_input; i++) {
+       for (i = 0; i < key->vs_prolog.num_inputs; i++) {
                bool divisor_is_one =
                        key->vs_prolog.states.instance_divisor_is_one & (1u << i);
                bool divisor_is_fetched =
@@ -7475,20 +7487,6 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                           ctx->args.arg_count + i, "");
        }
 
-       if (wrapped) {
-               LLVMBasicBlockRef bbs[2] = {
-                       LLVMGetInsertBlock(ctx->ac.builder),
-                       if_entry_block,
-               };
-               ac_build_endif(&ctx->ac, 11501);
-
-               LLVMValueRef values[2] = {
-                       ret,
-                       original_ret
-               };
-               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(ret), 2, values, bbs);
-       }
-
        si_llvm_build_ret(ctx, ret);
 }