radeonsi: allow generating VS prologs with 0 inputs
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index b63a39efe2da5e67a492a4608030c6759e9bd423..0f96b5f608825e18da4055407856229074921312 100644 (file)
@@ -678,10 +678,7 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context
                                                        LLVMValueRef base_addr,
                                                        LLVMValueRef vertex_index,
                                                        LLVMValueRef param_index,
-                                                       unsigned input_index,
-                                                       ubyte *name,
-                                                       ubyte *index,
-                                                       bool is_patch)
+                                                       ubyte name, ubyte index)
 {
        if (vertex_dw_stride) {
                base_addr = ac_build_imad(&ctx->ac, vertex_index,
@@ -693,11 +690,11 @@ static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context
                                          LLVMConstInt(ctx->i32, 4, 0), base_addr);
        }
 
-       int param = is_patch ?
-               si_shader_io_get_unique_index_patch(name[input_index],
-                                                   index[input_index]) :
-               si_shader_io_get_unique_index(name[input_index],
-                                             index[input_index], false);
+       int param = name == TGSI_SEMANTIC_PATCH ||
+                   name == TGSI_SEMANTIC_TESSINNER ||
+                   name == TGSI_SEMANTIC_TESSOUTER ?
+               si_shader_io_get_unique_index_patch(name, index) :
+               si_shader_io_get_unique_index(name, index, false);
 
        /* Add the base address of the element. */
        return LLVMBuildAdd(ctx->ac.builder, base_addr,
@@ -772,9 +769,8 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 
        return get_dw_address_from_generic_indices(ctx, vertex_dw_stride,
                                                   base_addr, vertex_index,
-                                                  ind_index, input_index,
-                                                  name, index,
-                                                  !reg.Register.Dimension);
+                                                  ind_index, name[input_index],
+                                                  index[input_index]);
 }
 
 /* The offchip buffer layout for TCS->TES is
@@ -836,16 +832,15 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
                                        struct si_shader_context *ctx,
                                        LLVMValueRef vertex_index,
                                        LLVMValueRef param_index,
-                                       unsigned param_base,
-                                       ubyte *name,
-                                       ubyte *index,
-                                       bool is_patch)
+                                       ubyte name, ubyte index)
 {
        unsigned param_index_base;
 
-       param_index_base = is_patch ?
-               si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
-               si_shader_io_get_unique_index(name[param_base], index[param_base], false);
+       param_index_base = name == TGSI_SEMANTIC_PATCH ||
+                          name == TGSI_SEMANTIC_TESSINNER ||
+                          name == TGSI_SEMANTIC_TESSOUTER ?
+               si_shader_io_get_unique_index_patch(name, index) :
+               si_shader_io_get_unique_index(name, index, false);
 
        if (param_index) {
                param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
@@ -874,7 +869,6 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
        reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 
        if (reg.Register.Dimension) {
-
                if (reg.Dimension.Indirect)
                        vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
                                                             1, reg.Dimension.Index);
@@ -904,14 +898,13 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 
                param_index = si_get_indirect_index(ctx, &reg.Indirect,
                                                    1, reg.Register.Index - param_base);
-
        } else {
                param_base = reg.Register.Index;
        }
 
        return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index, param_base,
-                                                              name, index, !reg.Register.Dimension);
+                                                              param_index, name[param_base],
+                                                              index[param_base]);
 }
 
 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
@@ -1088,9 +1081,22 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
        LLVMValueRef dw_addr, stride;
+       ubyte name, index;
 
        driver_location = driver_location / 4;
 
+       if (load_input) {
+               name = info->input_semantic_name[driver_location];
+               index = info->input_semantic_index[driver_location];
+       } else {
+               name = info->output_semantic_name[driver_location];
+               index = info->output_semantic_index[driver_location];
+       }
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
        if (load_input) {
                stride = get_tcs_in_vertex_dw_stride(ctx);
                dw_addr = get_tcs_in_current_patch_offset(ctx);
@@ -1108,21 +1114,9 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
                param_index = LLVMConstInt(ctx->i32, const_index, 0);
        }
 
-       ubyte *names;
-       ubyte *indices;
-       if (load_input) {
-               names = info->input_semantic_name;
-               indices = info->input_semantic_index;
-       } else {
-               names = info->output_semantic_name;
-               indices = info->output_semantic_index;
-       }
-
        dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
                                                      vertex_index, param_index,
-                                                     driver_location,
-                                                     names, indices,
-                                                     is_patch);
+                                                     name, index);
 
        LLVMValueRef value[4];
        for (unsigned i = 0; i < num_components; i++) {
@@ -1192,6 +1186,12 @@ LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
        LLVMValueRef base, addr;
 
        driver_location = driver_location / 4;
+       ubyte name = info->input_semantic_name[driver_location];
+       ubyte index = info->input_semantic_index[driver_location];
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
 
        base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
 
@@ -1200,10 +1200,8 @@ LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
        }
 
        addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index, driver_location,
-                                                              info->input_semantic_name,
-                                                              info->input_semantic_index,
-                                                              is_patch);
+                                                              param_index,
+                                                              name, index);
 
        /* TODO: This will generate rather ordinary llvm code, although it
         * should be easy for the optimiser to fix up. In future we might want
@@ -1216,13 +1214,12 @@ LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
                if (llvm_type_is_64bit(ctx, type)) {
                        offset *= 2;
                        if (offset == 4) {
+                               ubyte name = info->input_semantic_name[driver_location + 1];
+                               ubyte index = info->input_semantic_index[driver_location + 1];
                                 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
                                                                                        vertex_index,
                                                                                        param_index,
-                                                                                       driver_location + 1,
-                                                                                       info->input_semantic_name,
-                                                                                       info->input_semantic_index,
-                                                                                       is_patch);
+                                                                                      name, index);
                        }
 
                         offset = offset % 4;
@@ -1309,7 +1306,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
                if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
                        ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
                                                    buf_addr, base,
-                                                   4 * chan_index, ac_glc, false);
+                                                   4 * chan_index, ac_glc);
                }
 
                /* Write tess factors into VGPRs for the epilog. */
@@ -1329,7 +1326,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
                LLVMValueRef value = ac_build_gather_values(&ctx->ac,
                                                            values, 4);
                ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
-                                           base, 0, ac_glc, false);
+                                           base, 0, ac_glc);
        }
 }
 
@@ -1344,7 +1341,6 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
        struct si_shader_context *ctx = si_shader_context_from_abi(abi);
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        const unsigned component = var->data.location_frac;
-       const bool is_patch = var->data.patch;
        unsigned driver_location = var->data.driver_location;
        LLVMValueRef dw_addr, stride;
        LLVMValueRef buffer, base, addr;
@@ -1353,30 +1349,34 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
        bool is_tess_factor = false, is_tess_inner = false;
 
        driver_location = driver_location / 4;
+       ubyte name = info->output_semantic_name[driver_location];
+       ubyte index = info->output_semantic_index[driver_location];
 
        bool is_const = !param_index;
        if (!param_index)
                param_index = LLVMConstInt(ctx->i32, const_index, 0);
 
+       const bool is_patch = var->data.patch ||
+                             var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
+                             var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
+
+       assert((name == TGSI_SEMANTIC_PATCH ||
+               name == TGSI_SEMANTIC_TESSINNER ||
+               name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
+
        if (!is_patch) {
                stride = get_tcs_out_vertex_dw_stride(ctx);
                dw_addr = get_tcs_out_current_patch_offset(ctx);
                dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
                                                              vertex_index, param_index,
-                                                             driver_location,
-                                                             info->output_semantic_name,
-                                                             info->output_semantic_index,
-                                                             is_patch);
+                                                             name, index);
 
                skip_lds_store = !info->reads_pervertex_outputs;
        } else {
                dw_addr = get_tcs_out_current_patch_data_offset(ctx);
                dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
                                                              vertex_index, param_index,
-                                                             driver_location,
-                                                             info->output_semantic_name,
-                                                             info->output_semantic_index,
-                                                             is_patch);
+                                                             name, index);
 
                skip_lds_store = !info->reads_perpatch_outputs;
 
@@ -1400,25 +1400,21 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
        base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
 
        addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
-                                                              param_index, driver_location,
-                                                              info->output_semantic_name,
-                                                              info->output_semantic_index,
-                                                              is_patch);
+                                                              param_index, name, index);
 
-       for (unsigned chan = 0; chan < 8; chan++) {
+       for (unsigned chan = component; chan < 8; chan++) {
                if (!(writemask & (1 << chan)))
                        continue;
                LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
 
                unsigned buffer_store_offset = chan % 4;
                if (chan == 4) {
+                       ubyte name = info->output_semantic_name[driver_location + 1];
+                       ubyte index = info->output_semantic_index[driver_location + 1];
                         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
                                                                                vertex_index,
                                                                                param_index,
-                                                                               driver_location + 1,
-                                                                               info->output_semantic_name,
-                                                                               info->output_semantic_index,
-                                                                               is_patch);
+                                                                              name, index);
                }
 
                /* Skip LDS stores if there is no LDS read of this output. */
@@ -1432,7 +1428,7 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
                        ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
                                                    addr, base,
                                                    4 * buffer_store_offset,
-                                                    ac_glc, false);
+                                                    ac_glc);
                }
 
                /* Write tess factors into VGPRs for the epilog. */
@@ -1452,7 +1448,7 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
                LLVMValueRef value = ac_build_gather_values(&ctx->ac,
                                                            values, 4);
                ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
-                                           base, 0, ac_glc, false);
+                                           base, 0, ac_glc);
        }
 }
 
@@ -2661,7 +2657,7 @@ void si_emit_streamout_output(struct si_shader_context *ctx,
                                    vdata, num_comps,
                                    so_write_offsets[buf_idx],
                                    ctx->i32_0,
-                                   stream_out->dst_offset * 4, ac_glc | ac_slc, false);
+                                   stream_out->dst_offset * 4, ac_glc | ac_slc);
 }
 
 /**
@@ -3066,7 +3062,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
                LLVMValueRef value = lshs_lds_load(bld_base, ctx->ac.i32, ~0, lds_ptr);
 
                ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
-                                           buffer_offset, 0, ac_glc, false);
+                                           buffer_offset, 0, ac_glc);
        }
 }
 
@@ -3191,7 +3187,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                ac_build_buffer_store_dword(&ctx->ac, buffer,
                                            LLVMConstInt(ctx->i32, 0x80000000, 0),
                                            1, ctx->i32_0, tf_base,
-                                           offset, ac_glc, false);
+                                           offset, ac_glc);
                offset += 4;
        }
 
@@ -3200,12 +3196,12 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
        /* Store the tessellation factors. */
        ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
                                    MIN2(stride, 4), byteoffset, tf_base,
-                                   offset, ac_glc, false);
+                                   offset, ac_glc);
        offset += 16;
        if (vec1)
                ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
                                            stride - 4, byteoffset, tf_base,
-                                           offset, ac_glc, false);
+                                           offset, ac_glc);
 
        /* Store the tess factors into the offchip buffer if TES reads them. */
        if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
@@ -3228,7 +3224,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
 
                ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
                                            outer_comps, tf_outer_offset,
-                                           base, 0, ac_glc, false);
+                                           base, 0, ac_glc);
                if (inner_comps) {
                        param_inner = si_shader_io_get_unique_index_patch(
                                              TGSI_SEMANTIC_TESSINNER, 0);
@@ -3239,7 +3235,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                    ac_build_gather_values(&ctx->ac, inner, inner_comps);
                        ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
                                                    inner_comps, tf_inner_offset,
-                                                   base, 0, ac_glc, false);
+                                                   base, 0, ac_glc);
                }
        }
 
@@ -3554,7 +3550,7 @@ static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
                                                    out_val, 1, NULL,
                                                    ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                                    (4 * param + chan) * 4,
-                                                   ac_glc | ac_slc, true);
+                                                   ac_glc | ac_slc | ac_swizzled);
                }
        }
 
@@ -4283,7 +4279,7 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
                                                    ctx->gsvs_ring[stream],
                                                    out_val, 1,
                                                    voffset, soffset, 0,
-                                                   ac_glc | ac_slc, true);
+                                                   ac_glc | ac_slc | ac_swizzled);
                }
        }
 
@@ -4519,9 +4515,12 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx,
 
        if (!shader->is_gs_copy_shader) {
                /* Vertex load indices. */
-               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
-               for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
-                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+               if (shader->selector->info.num_inputs) {
+                       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+                                  &ctx->vertex_index0);
+                       for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
+                               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+               }
                *num_prolog_vgprs += shader->selector->info.num_inputs;
        }
 }
@@ -4904,13 +4903,17 @@ static void create_function(struct si_shader_context *ctx)
                                   &ctx->cs_user_data);
                }
 
+               /* Hardware SGPRs. */
                for (i = 0; i < 3; i++) {
                        if (shader->selector->info.uses_block_id[i]) {
                                ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
                                           &ctx->args.workgroup_ids[i]);
                        }
                }
+               if (shader->selector->info.uses_subgroup_info)
+                       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
 
+               /* Hardware VGPRs. */
                ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
                           &ctx->args.local_invocation_ids);
                break;
@@ -5993,6 +5996,22 @@ static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
        return sel->vs_needs_prolog || key->ls_vgpr_fix;
 }
 
+LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
+{
+       /* Build unsigned "thread ID < ES thread count"; count = merged_wave_info unpacked with (0, 8). */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                            ac_get_thread_id(&ctx->ac),
+                            si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
+}
+
+LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
+{
+       /* Build unsigned "thread ID < GS thread count"; count = merged_wave_info unpacked with (8, 8). */
+       return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                            ac_get_thread_id(&ctx->ac),
+                            si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
+}
+
 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                 struct nir_shader *nir, bool free_nir)
 {
@@ -6160,7 +6179,7 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
                           ctx->type == PIPE_SHADER_GEOMETRY ||
                           (shader->key.as_ngg && !shader->key.as_es)) {
-                       LLVMValueRef num_threads;
+                       LLVMValueRef thread_enabled;
                        bool nested_barrier;
 
                        if (!shader->is_monolithic ||
@@ -6177,21 +6196,15 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                                        nested_barrier = true;
                                }
 
-                               /* Number of patches / primitives */
-                               num_threads = si_unpack_param(ctx, ctx->merged_wave_info, 8, 8);
+                               thread_enabled = si_is_gs_thread(ctx);
                        } else {
-                               /* Number of vertices */
-                               num_threads = si_unpack_param(ctx, ctx->merged_wave_info, 0, 8);
+                               thread_enabled = si_is_es_thread(ctx);
                                nested_barrier = false;
                        }
 
-                       LLVMValueRef ena =
-                               LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                                           ac_get_thread_id(&ctx->ac), num_threads, "");
-
                        ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
                        ctx->merged_wrap_if_label = 11500;
-                       ac_build_ifcc(&ctx->ac, ena, ctx->merged_wrap_if_label);
+                       ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
 
                        if (nested_barrier) {
                                /* Execute a barrier before the second shader in
@@ -6261,7 +6274,7 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
        memset(key, 0, sizeof(*key));
        key->vs_prolog.states = *prolog_key;
        key->vs_prolog.num_input_sgprs = num_input_sgprs;
-       key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+       key->vs_prolog.num_inputs = info->num_inputs;
        key->vs_prolog.as_ls = shader_out->key.as_ls;
        key->vs_prolog.as_es = shader_out->key.as_es;
        key->vs_prolog.as_ngg = shader_out->key.as_ngg;
@@ -6929,6 +6942,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                                             shader->info.num_input_sgprs,
                                             &shader->key.part.vs.prolog,
                                             shader, &prolog_key);
+                       prolog_key.vs_prolog.is_monolithic = true;
                        si_build_vs_prolog_function(&ctx, &prolog_key);
                        parts[0] = ctx.main_fn;
                }
@@ -7323,7 +7337,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        memset(&ctx->args, 0, sizeof(ctx->args));
 
        /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
-       returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
+       returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
                         sizeof(LLVMTypeRef));
        num_returns = 0;
 
@@ -7343,7 +7357,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
        }
 
        /* Vertex load indices. */
-       for (i = 0; i <= key->vs_prolog.last_input; i++)
+       for (i = 0; i < key->vs_prolog.num_inputs; i++)
                returns[num_returns++] = ctx->f32;
 
        /* Create the function. */
@@ -7418,22 +7432,6 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                           key->vs_prolog.num_input_sgprs + i, "");
        }
 
-       LLVMValueRef original_ret = ret;
-       bool wrapped = false;
-       LLVMBasicBlockRef if_entry_block = NULL;
-
-       if (key->vs_prolog.is_monolithic && key->vs_prolog.as_ngg) {
-               LLVMValueRef num_threads;
-               LLVMValueRef ena;
-
-               num_threads = si_unpack_param(ctx, merged_wave_info, 0, 8);
-               ena = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
-                                       ac_get_thread_id(&ctx->ac), num_threads, "");
-               if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
-               ac_build_ifcc(&ctx->ac, ena, 11501);
-               wrapped = true;
-       }
-
        /* Compute vertex load indices from instance divisors. */
        LLVMValueRef instance_divisor_constbuf = NULL;
 
@@ -7445,7 +7443,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                        ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
        }
 
-       for (i = 0; i <= key->vs_prolog.last_input; i++) {
+       for (i = 0; i < key->vs_prolog.num_inputs; i++) {
                bool divisor_is_one =
                        key->vs_prolog.states.instance_divisor_is_one & (1u << i);
                bool divisor_is_fetched =
@@ -7489,20 +7487,6 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                           ctx->args.arg_count + i, "");
        }
 
-       if (wrapped) {
-               LLVMBasicBlockRef bbs[2] = {
-                       LLVMGetInsertBlock(ctx->ac.builder),
-                       if_entry_block,
-               };
-               ac_build_endif(&ctx->ac, 11501);
-
-               LLVMValueRef values[2] = {
-                       ret,
-                       original_ret
-               };
-               ret = ac_build_phi(&ctx->ac, LLVMTypeOf(ret), 2, values, bbs);
-       }
-
        si_llvm_build_ret(ctx, ret);
 }