X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_nir_to_llvm.c;h=51416262df19edb2f7b2f1834cb47b9dcd065376;hp=148a571fc79aa6e4e9e1a70e493bdf023232ee72;hb=2a6811f0f981c8d67d0131a0b74549b641ea2247;hpb=43da33c1695132ee094aac80991852c4954bf758 diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 148a571fc79..51416262df1 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -28,13 +28,10 @@ #include "radv_private.h" #include "radv_shader.h" #include "radv_shader_helper.h" +#include "radv_shader_args.h" +#include "radv_debug.h" #include "nir/nir.h" -#include -#include -#include -#include - #include "sid.h" #include "ac_binary.h" #include "ac_llvm_util.h" @@ -47,38 +44,22 @@ struct radv_shader_context { struct ac_llvm_context ac; - const struct radv_nir_compiler_options *options; - struct radv_shader_info *shader_info; const struct nir_shader *shader; struct ac_shader_abi abi; + const struct radv_shader_args *args; + + gl_shader_stage stage; unsigned max_workgroup_size; LLVMContextRef context; LLVMValueRef main_function; LLVMValueRef descriptor_sets[MAX_SETS]; + LLVMValueRef ring_offsets; - LLVMValueRef vertex_buffers; LLVMValueRef rel_auto_id; - LLVMValueRef vs_prim_id; - LLVMValueRef es2gs_offset; - - LLVMValueRef oc_lds; - LLVMValueRef merged_wave_info; - LLVMValueRef tess_factor_offset; - LLVMValueRef tes_rel_patch_id; - LLVMValueRef tes_u; - LLVMValueRef tes_v; - - /* HW GS */ - /* On gfx10: - * - bits 0..10: ordered_wave_id - * - bits 12..20: number of vertices in group - * - bits 22..30: number of primitives in group - */ - LLVMValueRef gs_tg_info; - LLVMValueRef gs2vs_offset; + LLVMValueRef gs_wave_id; LLVMValueRef gs_vtx_offset[6]; @@ -87,19 +68,10 @@ struct radv_shader_context { LLVMValueRef hs_ring_tess_offchip; LLVMValueRef hs_ring_tess_factor; - /* Streamout */ - LLVMValueRef streamout_buffers; - LLVMValueRef streamout_write_idx; - LLVMValueRef streamout_config; - LLVMValueRef streamout_offset[4]; - - gl_shader_stage stage; - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; uint64_t output_mask; - bool is_gs_copy_shader; LLVMValueRef gs_next_vertex[4]; LLVMValueRef gs_curprim_verts[4]; LLVMValueRef gs_generated_prims[4]; @@ -119,14 +91,6 @@ struct radv_shader_output_values { unsigned usage_mask; }; -enum radeon_llvm_calling_convention { - RADEON_LLVM_AMDGPU_VS = 87, - RADEON_LLVM_AMDGPU_GS = 88, - RADEON_LLVM_AMDGPU_PS = 89, - RADEON_LLVM_AMDGPU_CS = 90, - RADEON_LLVM_AMDGPU_HS = 93, -}; - static inline struct radv_shader_context * radv_shader_context_from_abi(struct ac_shader_abi *abi) { @@ -138,96 +102,17 @@ static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx) { switch (ctx->stage) { case MESA_SHADER_TESS_CTRL: - return ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); + return ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids), + 0, 8); case MESA_SHADER_TESS_EVAL: - return ctx->tes_rel_patch_id; + return ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id); break; default: unreachable("Illegal stage"); } } -static unsigned -get_tcs_num_patches(struct radv_shader_context *ctx) -{ - unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; - uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - unsigned num_patches; - unsigned hardware_lds_size; - - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. - */ - num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4; - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - */ - hardware_lds_size = 32768; - - /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single - * threadgroup, even though there is more than 32 KiB LDS. - * - * Test: dEQP-VK.tessellation.shader_input_output.barrier - */ - if (ctx->options->chip_class >= GFX7 && ctx->options->family != CHIP_STONEY) - hardware_lds_size = 65536; - - num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); - /* Make sure the output data fits in the offchip buffer */ - num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size); - /* Not necessary for correctness, but improves performance. The - * specific value is taken from the proprietary driver. - */ - num_patches = MIN2(num_patches, 40); - - /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */ - if (ctx->options->chip_class == GFX6) { - unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); - num_patches = MIN2(num_patches, one_wave); - } - return num_patches; -} - -static unsigned -calculate_tess_lds_size(struct radv_shader_context *ctx) -{ - unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp; - unsigned num_tcs_outputs, num_tcs_patch_outputs; - unsigned input_vertex_size, output_vertex_size; - unsigned input_patch_size, output_patch_size; - unsigned pervertex_output_patch_size; - unsigned output_patch0_offset; - unsigned num_patches; - unsigned lds_size; - - num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; - num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); - num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); - - input_vertex_size = ctx->tcs_num_inputs * 16; - output_vertex_size = num_tcs_outputs * 16; - - input_patch_size = num_tcs_input_cp * input_vertex_size; - - pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - - num_patches = ctx->tcs_num_patches; - output_patch0_offset = input_patch_size * num_patches; - - lds_size = output_patch0_offset + output_patch_size * num_patches; - return lds_size; -} - /* Tessellation shaders pass outputs to the next shader using LDS. * * LS outputs = TCS inputs @@ -251,9 +136,9 @@ calculate_tess_lds_size(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_in_patch_stride(struct radv_shader_context *ctx) { - assert (ctx->stage == MESA_SHADER_TESS_CTRL); + assert(ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; input_patch_size /= 4; return LLVMConstInt(ctx->ac.i32, input_patch_size, false); @@ -262,8 +147,8 @@ get_tcs_in_patch_stride(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_out_patch_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; @@ -274,7 +159,7 @@ get_tcs_out_patch_stride(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_out_vertex_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; output_vertex_size /= 4; return LLVMConstInt(ctx->ac.i32, output_vertex_size, false); @@ -285,7 +170,7 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx) { assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; unsigned num_patches = ctx->tcs_num_patches; @@ -299,10 +184,10 @@ get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) { assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned num_patches = ctx->tcs_num_patches; @@ -345,87 +230,16 @@ get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx) patch0_patch_data_offset); } -#define MAX_ARGS 64 -struct arg_info { - LLVMTypeRef types[MAX_ARGS]; - LLVMValueRef *assign[MAX_ARGS]; - uint8_t count; - uint8_t sgpr_count; - uint8_t num_sgprs_used; - uint8_t num_vgprs_used; -}; - -enum radv_arg_regfile { - ARG_SGPR, - ARG_VGPR, -}; - -static void -add_arg(struct arg_info *info, enum radv_arg_regfile regfile, LLVMTypeRef type, - LLVMValueRef *param_ptr) -{ - assert(info->count < MAX_ARGS); - - info->assign[info->count] = param_ptr; - info->types[info->count] = type; - info->count++; - - if (regfile == ARG_SGPR) { - info->num_sgprs_used += ac_get_type_size(type) / 4; - info->sgpr_count++; - } else { - assert(regfile == ARG_VGPR); - info->num_vgprs_used += ac_get_type_size(type) / 4; - } -} - -static void assign_arguments(LLVMValueRef main_function, - struct arg_info *info) -{ - unsigned i; - for (i = 0; i < info->count; i++) { - if (info->assign[i]) - *info->assign[i] = LLVMGetParam(main_function, i); - } -} - static LLVMValueRef -create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, - LLVMBuilderRef builder, LLVMTypeRef *return_types, - unsigned num_return_elems, - struct arg_info *args, +create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, + LLVMBuilderRef builder, + const struct ac_shader_args *args, + enum ac_llvm_calling_convention convention, unsigned max_workgroup_size, const struct radv_nir_compiler_options *options) { - LLVMTypeRef main_function_type, ret_type; - LLVMBasicBlockRef main_function_body; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx, return_types, - num_return_elems, true); - else - ret_type = LLVMVoidTypeInContext(ctx); - - /* Setup the function */ - main_function_type = - LLVMFunctionType(ret_type, args->types, args->count, 0); LLVMValueRef main_function = - LLVMAddFunction(module, "main", main_function_type); - main_function_body = - LLVMAppendBasicBlockInContext(ctx, main_function, "main_body"); - LLVMPositionBuilderAtEnd(builder, main_function_body); - - LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS); - for (unsigned i = 0; i < args->sgpr_count; ++i) { - LLVMValueRef P = LLVMGetParam(main_function, i); - - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG); - - if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); - ac_add_attr_dereferenceable(P, UINT64_MAX); - } - } + ac_build_main(args, ctx, convention, "main", ctx->voidt, module); if (options->address32_hi) { ac_llvm_add_target_dep_function_attr(main_function, @@ -438,451 +252,54 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, return main_function; } - -static void -set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - ud_info->sgpr_idx = *sgpr_idx; - ud_info->num_sgprs = num_sgprs; - *sgpr_idx += num_sgprs; -} - -static void -set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - struct radv_userdata_info *ud_info = - &ctx->shader_info->user_sgprs_locs.shader_data[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, num_sgprs); -} - -static void -set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) -{ - bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; - - set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); -} - -static void -set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) -{ - struct radv_userdata_locations *locs = - &ctx->shader_info->user_sgprs_locs; - struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, 1); - - locs->descriptor_sets_enabled |= 1 << idx; -} - -struct user_sgpr_info { - bool need_ring_offsets; - bool indirect_all_descriptor_sets; - uint8_t remaining_sgprs; -}; - -static bool needs_view_index_sgpr(struct radv_shader_context *ctx, - gl_shader_stage stage) -{ - switch (stage) { - case MESA_SHADER_VERTEX: - if (ctx->shader_info->needs_multiview_view_index || - (!ctx->options->key.vs_common_out.as_es && !ctx->options->key.vs_common_out.as_ls && ctx->options->key.has_multiview_view_index)) - return true; - break; - case MESA_SHADER_TESS_EVAL: - if (ctx->shader_info->needs_multiview_view_index || (!ctx->options->key.vs_common_out.as_es && ctx->options->key.has_multiview_view_index)) - return true; - break; - case MESA_SHADER_GEOMETRY: - case MESA_SHADER_TESS_CTRL: - if (ctx->shader_info->needs_multiview_view_index) - return true; - break; - default: - break; - } - return false; -} - -static uint8_t -count_vs_user_sgprs(struct radv_shader_context *ctx) -{ - uint8_t count = 0; - - if (ctx->shader_info->vs.has_vertex_buffers) - count++; - count += ctx->shader_info->vs.needs_draw_id ? 3 : 2; - - return count; -} - -static void allocate_inline_push_consts(struct radv_shader_context *ctx, - struct user_sgpr_info *user_sgpr_info) -{ - uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; - - /* Only supported if shaders use push constants. */ - if (ctx->shader_info->min_push_constant_used == UINT8_MAX) - return; - - /* Only supported if shaders don't have indirect push constants. */ - if (ctx->shader_info->has_indirect_push_constants) - return; - - /* Only supported for 32-bit push constants. */ - if (!ctx->shader_info->has_only_32bit_push_constants) - return; - - uint8_t num_push_consts = - (ctx->shader_info->max_push_constant_used - - ctx->shader_info->min_push_constant_used) / 4; - - /* Check if the number of user SGPRs is large enough. */ - if (num_push_consts < remaining_sgprs) { - ctx->shader_info->num_inline_push_consts = num_push_consts; - } else { - ctx->shader_info->num_inline_push_consts = remaining_sgprs; - } - - /* Clamp to the maximum number of allowed inlined push constants. */ - if (ctx->shader_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) - ctx->shader_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; - - if (ctx->shader_info->num_inline_push_consts == num_push_consts && - !ctx->shader_info->loads_dynamic_offsets) { - /* Disable the default push constants path if all constants are - * inlined and if shaders don't use dynamic descriptors. - */ - ctx->shader_info->loads_push_constants = false; - } - - ctx->shader_info->base_inline_push_consts = - ctx->shader_info->min_push_constant_used / 4; -} - -static void allocate_user_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, - bool needs_view_index, - struct user_sgpr_info *user_sgpr_info) -{ - uint8_t user_sgpr_count = 0; - - memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info)); - - /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */ - if (stage == MESA_SHADER_GEOMETRY || - stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_CTRL || - stage == MESA_SHADER_TESS_EVAL || - ctx->is_gs_copy_shader) - user_sgpr_info->need_ring_offsets = true; - - if (stage == MESA_SHADER_FRAGMENT && - ctx->shader_info->ps.needs_sample_positions) - user_sgpr_info->need_ring_offsets = true; - - /* 2 user sgprs will nearly always be allocated for scratch/rings */ - if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) { - user_sgpr_count += 2; - } - - switch (stage) { - case MESA_SHADER_COMPUTE: - if (ctx->shader_info->cs.uses_grid_size) - user_sgpr_count += 3; - break; - case MESA_SHADER_FRAGMENT: - user_sgpr_count += ctx->shader_info->ps.needs_sample_positions; - break; - case MESA_SHADER_VERTEX: - if (!ctx->is_gs_copy_shader) - user_sgpr_count += count_vs_user_sgprs(ctx); - break; - case MESA_SHADER_TESS_CTRL: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - user_sgpr_count += count_vs_user_sgprs(ctx); - } - break; - case MESA_SHADER_TESS_EVAL: - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) { - user_sgpr_count += count_vs_user_sgprs(ctx); - } - } - break; - default: - break; - } - - if (needs_view_index) - user_sgpr_count++; - - if (ctx->shader_info->loads_push_constants) - user_sgpr_count++; - - if (ctx->shader_info->so.num_outputs) - user_sgpr_count++; - - uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; - uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; - uint32_t num_desc_set = - util_bitcount(ctx->shader_info->desc_set_used_mask); - - if (remaining_sgprs < num_desc_set) { - user_sgpr_info->indirect_all_descriptor_sets = true; - user_sgpr_info->remaining_sgprs = remaining_sgprs - 1; - } else { - user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; - } - - allocate_inline_push_consts(ctx, user_sgpr_info); -} - static void -declare_global_input_sgprs(struct radv_shader_context *ctx, - const struct user_sgpr_info *user_sgpr_info, - struct arg_info *args, - LLVMValueRef *desc_sets) +load_descriptor_sets(struct radv_shader_context *ctx) { - LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8); - - /* 1 for each descriptor set */ - if (!user_sgpr_info->indirect_all_descriptor_sets) { - uint32_t mask = ctx->shader_info->desc_set_used_mask; - + uint32_t mask = ctx->args->shader_info->desc_set_used_mask; + if (ctx->args->shader_info->need_indirect_descriptor_sets) { + LLVMValueRef desc_sets = + ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]); while (mask) { int i = u_bit_scan(&mask); - add_arg(args, ARG_SGPR, type, &ctx->descriptor_sets[i]); - } - } else { - add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(type), - desc_sets); - } - - if (ctx->shader_info->loads_push_constants) { - /* 1 for push constants and dynamic descriptors */ - add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants); - } - - for (unsigned i = 0; i < ctx->shader_info->num_inline_push_consts; i++) { - add_arg(args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.inline_push_consts[i]); - } - ctx->abi.num_inline_push_consts = ctx->shader_info->num_inline_push_consts; - ctx->abi.base_inline_push_consts = ctx->shader_info->base_inline_push_consts; - - if (ctx->shader_info->so.num_outputs) { - add_arg(args, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->ac.v4i32), - &ctx->streamout_buffers); - } -} - -static void -declare_vs_specific_input_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, - struct arg_info *args) -{ - if (!ctx->is_gs_copy_shader && - (stage == MESA_SHADER_VERTEX || - (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->vs.has_vertex_buffers) { - add_arg(args, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->ac.v4i32), - &ctx->vertex_buffers); - } - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex); - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance); - if (ctx->shader_info->vs.needs_draw_id) { - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id); - } - } -} - -static void -declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) -{ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id); - if (!ctx->is_gs_copy_shader) { - if (ctx->options->key.vs_common_out.as_ls) { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id); - if (ctx->ac.chip_class >= GFX10) { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - } - } else { - if (ctx->ac.chip_class >= GFX10) { - if (ctx->options->key.vs_common_out.as_ngg) { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - } - } - } -} - -static void -declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, - struct arg_info *args) -{ - int i; - - if (ctx->options->use_ngg_streamout) - return; - - /* Streamout SGPRs. */ - if (ctx->shader_info->so.num_outputs) { - assert(stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL); - - if (stage != MESA_SHADER_TESS_EVAL) { - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_config); - } else { - args->assign[args->count - 1] = &ctx->streamout_config; - args->types[args->count - 1] = ctx->ac.i32; - } - - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_write_idx); - } - - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (i = 0; i < 4; i++) { - if (!ctx->shader_info->so.strides[i]) - continue; - - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]); - } -} - -static void -declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) -{ - add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u); - add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id); -} - -static void -set_global_input_locs(struct radv_shader_context *ctx, - const struct user_sgpr_info *user_sgpr_info, - LLVMValueRef desc_sets, uint8_t *user_sgpr_idx) -{ - uint32_t mask = ctx->shader_info->desc_set_used_mask; - - if (!user_sgpr_info->indirect_all_descriptor_sets) { - while (mask) { - int i = u_bit_scan(&mask); + ctx->descriptor_sets[i] = + ac_build_load_to_sgpr(&ctx->ac, desc_sets, + LLVMConstInt(ctx->ac.i32, i, false)); - set_loc_desc(ctx, i, user_sgpr_idx); } } else { - set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, - user_sgpr_idx); - while (mask) { int i = u_bit_scan(&mask); ctx->descriptor_sets[i] = - ac_build_load_to_sgpr(&ctx->ac, desc_sets, - LLVMConstInt(ctx->ac.i32, i, false)); - - } - - ctx->shader_info->need_indirect_descriptor_sets = true; - } - - if (ctx->shader_info->loads_push_constants) { - set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); - } - - if (ctx->shader_info->num_inline_push_consts) { - set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, - ctx->shader_info->num_inline_push_consts); - } - - if (ctx->streamout_buffers) { - set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, - user_sgpr_idx); - } -} - -static void -set_vs_specific_input_locs(struct radv_shader_context *ctx, - gl_shader_stage stage, bool has_previous_stage, - gl_shader_stage previous_stage, - uint8_t *user_sgpr_idx) -{ - if (!ctx->is_gs_copy_shader && - (stage == MESA_SHADER_VERTEX || - (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->vs.has_vertex_buffers) { - set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, - user_sgpr_idx); + ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]); } - - unsigned vs_num = 2; - if (ctx->shader_info->vs.needs_draw_id) - vs_num++; - - set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, - user_sgpr_idx, vs_num); } } -static void set_llvm_calling_convention(LLVMValueRef func, - gl_shader_stage stage) +static enum ac_llvm_calling_convention +get_llvm_calling_convention(LLVMValueRef func, gl_shader_stage stage) { - enum radeon_llvm_calling_convention calling_conv; - switch (stage) { case MESA_SHADER_VERTEX: case MESA_SHADER_TESS_EVAL: - calling_conv = RADEON_LLVM_AMDGPU_VS; + return AC_LLVM_AMDGPU_VS; break; case MESA_SHADER_GEOMETRY: - calling_conv = RADEON_LLVM_AMDGPU_GS; + return AC_LLVM_AMDGPU_GS; break; case MESA_SHADER_TESS_CTRL: - calling_conv = RADEON_LLVM_AMDGPU_HS; + return AC_LLVM_AMDGPU_HS; break; case MESA_SHADER_FRAGMENT: - calling_conv = RADEON_LLVM_AMDGPU_PS; + return AC_LLVM_AMDGPU_PS; break; case MESA_SHADER_COMPUTE: - calling_conv = RADEON_LLVM_AMDGPU_CS; + return AC_LLVM_AMDGPU_CS; break; default: unreachable("Unhandle shader type"); } - - LLVMSetFunctionCallConv(func, calling_conv); } /* Returns whether the stage is a stage that can be directly before the GS */ @@ -893,331 +310,37 @@ static bool is_pre_gs_stage(gl_shader_stage stage) static void create_function(struct radv_shader_context *ctx, gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage) + bool has_previous_stage) { - uint8_t user_sgpr_idx; - struct user_sgpr_info user_sgpr_info; - struct arg_info args = {}; - LLVMValueRef desc_sets; - bool needs_view_index = needs_view_index_sgpr(ctx, stage); - if (ctx->ac.chip_class >= GFX10) { - if (is_pre_gs_stage(stage) && ctx->options->key.vs_common_out.as_ngg) { + if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) { /* On GFX10, VS is merged into GS for NGG. */ - previous_stage = stage; stage = MESA_SHADER_GEOMETRY; has_previous_stage = true; } } - allocate_user_sgprs(ctx, stage, has_previous_stage, - previous_stage, needs_view_index, &user_sgpr_info); - - if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) { - add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32), - &ctx->ring_offsets); - } - - switch (stage) { - case MESA_SHADER_COMPUTE: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (ctx->shader_info->cs.uses_grid_size) { - add_arg(&args, ARG_SGPR, ctx->ac.v3i32, - &ctx->abi.num_work_groups); - } - - for (int i = 0; i < 3; i++) { - ctx->abi.workgroup_ids[i] = NULL; - if (ctx->shader_info->cs.uses_block_id[i]) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.workgroup_ids[i]); - } - } - - if (ctx->shader_info->cs.uses_local_invocation_idx) - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.tg_size); - add_arg(&args, ARG_VGPR, ctx->ac.v3i32, - &ctx->abi.local_invocation_ids); - break; - case MESA_SHADER_VERTEX: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &args); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - if (ctx->options->key.vs_common_out.as_es) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->es2gs_offset); - } else if (ctx->options->key.vs_common_out.as_ls) { - /* no extra parameters */ - } else { - declare_streamout_sgprs(ctx, stage, &args); - } - - declare_vs_input_vgprs(ctx, &args); - break; - case MESA_SHADER_TESS_CTRL: - if (has_previous_stage) { - // First 6 system regs - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->merged_wave_info); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->tess_factor_offset); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - declare_vs_specific_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, &args); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_patch_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_rel_ids); - - declare_vs_input_vgprs(ctx, &args); - } else { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->tess_factor_offset); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_patch_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_rel_ids); - } - break; - case MESA_SHADER_TESS_EVAL: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - if (ctx->options->key.vs_common_out.as_es) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->es2gs_offset); - } else { - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); - declare_streamout_sgprs(ctx, stage, &args); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - } - declare_tes_input_vgprs(ctx, &args); - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - // First 6 system regs - if (ctx->options->key.vs_common_out.as_ngg) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs_tg_info); - } else { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs2vs_offset); - } - - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->merged_wave_info); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (previous_stage != MESA_SHADER_TESS_EVAL) { - declare_vs_specific_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, - &args); - } - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[0]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[2]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_prim_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_invocation_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[4]); - - if (previous_stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &args); - } else { - declare_tes_input_vgprs(ctx, &args); - } - } else { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[0]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[1]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_prim_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[2]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[3]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[4]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[5]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_invocation_id); - } - break; - case MESA_SHADER_FRAGMENT: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_centroid); - add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */ - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_centroid); - add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL); /* line stipple tex */ - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage); - add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL); /* fixed pt */ - break; - default: - unreachable("Shader stage not implemented"); - } - ctx->main_function = create_llvm_function( - ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args, - ctx->max_workgroup_size, ctx->options); - set_llvm_calling_convention(ctx->main_function, stage); - - - ctx->shader_info->num_input_vgprs = 0; - ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0; - - ctx->shader_info->num_input_sgprs += args.num_sgprs_used; - - if (ctx->stage != MESA_SHADER_FRAGMENT) - ctx->shader_info->num_input_vgprs = args.num_vgprs_used; + &ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac, + get_llvm_calling_convention(ctx->main_function, stage), + ctx->max_workgroup_size, + ctx->args->options); - assign_arguments(ctx->main_function, &args); + ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", + LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), + NULL, 0, AC_FUNC_ATTR_READNONE); + ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, + ac_array_in_const_addr_space(ctx->ac.v4i32), ""); - user_sgpr_idx = 0; - - if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) { - set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, - &user_sgpr_idx); - if (ctx->options->supports_spill) { - ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", - LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), - NULL, 0, AC_FUNC_ATTR_READNONE); - ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, - ac_array_in_const_addr_space(ctx->ac.v4i32), ""); - } - } - - /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including - * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */ - if (has_previous_stage) - user_sgpr_idx = 0; - - set_global_input_locs(ctx, &user_sgpr_info, desc_sets, &user_sgpr_idx); - - switch (stage) { - case MESA_SHADER_COMPUTE: - if (ctx->shader_info->cs.uses_grid_size) { - set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, - &user_sgpr_idx, 3); - } - break; - case MESA_SHADER_VERTEX: - set_vs_specific_input_locs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_idx); - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_TESS_CTRL: - set_vs_specific_input_locs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_idx); - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_TESS_EVAL: - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - set_vs_specific_input_locs(ctx, stage, - has_previous_stage, - previous_stage, - &user_sgpr_idx); - } - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_FRAGMENT: - break; - default: - unreachable("Shader stage not implemented"); - } + load_descriptor_sets(ctx); if (stage == MESA_SHADER_TESS_CTRL || - (stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.as_ls) || + (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) || /* GFX9 has the ESGS ring buffer in LDS. */ (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { ac_declare_lds_as_pointer(&ctx->ac); } - ctx->shader_info->num_user_sgprs = user_sgpr_idx; } @@ -1227,7 +350,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; - struct radv_pipeline_layout *pipeline_layout = ctx->options->layout; + struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; unsigned base_offset = layout->binding[binding].offset; LLVMValueRef offset, stride; @@ -1236,7 +359,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; - desc_ptr = ctx->abi.push_constants; + desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants); base_offset = pipeline_layout->push_constant_size + 16 * idx; stride = LLVMConstInt(ctx->ac.i32, 16, false); } else @@ -1260,7 +383,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, if (ctx->ac.chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -1269,7 +392,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, LLVMValueRef desc_components[4] = { LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""), - LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), false), /* High limit to support variable sizes. */ LLVMConstInt(ctx->ac.i32, 0xffffffff, false), LLVMConstInt(ctx->ac.i32, desc_type, false), @@ -1305,9 +428,9 @@ static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx) uint32_t num_patches = ctx->tcs_num_patches; uint32_t num_tcs_outputs; if (ctx->stage == MESA_SHADER_TESS_CTRL) - num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); else - num_tcs_outputs = ctx->options->key.tes.tcs_num_outputs; + num_tcs_outputs = ctx->args->options->key.tes.tcs_num_outputs; uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; @@ -1476,6 +599,7 @@ store_tcs_output(struct ac_shader_abi *abi, LLVMValueRef dw_addr; LLVMValueRef stride = NULL; LLVMValueRef buf_addr = NULL; + LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds); unsigned param; bool store_lds = true; @@ -1532,14 +656,14 @@ store_tcs_output(struct ac_shader_abi *abi, if (!is_tess_factor && writemask != 0xF) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1, - buf_addr, ctx->oc_lds, - 4 * (base + chan), ac_glc, false); + buf_addr, oc_lds, + 4 * (base + chan), ac_glc); } if (writemask == 0xF) { ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4, - buf_addr, ctx->oc_lds, - (base * 4), ac_glc, false); + buf_addr, oc_lds, + (base * 4), ac_glc); } } @@ -1560,6 +684,7 @@ load_tes_input(struct ac_shader_abi *abi, struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef buf_addr; LLVMValueRef result; + LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds); unsigned param = shader_io_get_unique_index(location); if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { @@ -1578,7 +703,7 @@ load_tes_input(struct ac_shader_abi *abi, buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, ""); result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL, - buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false); + buf_addr, oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false); result = ac_trim_vector(&ctx->ac, result, num_components); return result; } @@ -1670,13 +795,6 @@ load_gs_input(struct ac_shader_abi *abi, return result; } - -static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - ac_build_kill_if_false(&ctx->ac, visible); -} - static uint32_t radv_get_sample_pos_offset(uint32_t num_samples) { @@ -1711,7 +829,7 @@ static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, ac_array_in_const_addr_space(ctx->ac.v2f32), ""); uint32_t sample_pos_offset = - radv_get_sample_pos_offset(ctx->options->key.fs.num_samples); + radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples); sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id, @@ -1727,11 +845,11 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); uint8_t log2_ps_iter_samples; - if (ctx->shader_info->ps.force_persample) { + if (ctx->args->shader_info->ps.force_persample) { log2_ps_iter_samples = - util_logbase2(ctx->options->key.fs.num_samples); + util_logbase2(ctx->args->options->key.fs.num_samples); } else { - log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; + log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples; } /* The bit pattern matches that used by fixed function fragment @@ -1748,53 +866,36 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples]; LLVMValueRef result, sample_id; - sample_id = ac_unpack_param(&ctx->ac, abi->ancillary, 8, 4); + sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4); sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, ""); - result = LLVMBuildAnd(ctx->ac.builder, sample_id, abi->sample_coverage, ""); + result = LLVMBuildAnd(ctx->ac.builder, sample_id, + ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), ""); return result; } static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs); static void -visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs) +visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs) { - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->options->key.vs_common_out.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + if (ctx->args->options->key.vs_common_out.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs); return; } - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, don't emit any more: excessive vertex emissions are not - * supposed to have any effect. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - - bool use_kill = !ctx->shader_info->gs.writes_memory; - if (use_kill) - ac_build_kill_if_false(&ctx->ac, can_emit); - else - ac_build_ifcc(&ctx->ac, can_emit, 6505); - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -1814,7 +915,7 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr offset++; - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, ""); voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), ""); out_val = ac_to_integer(&ctx->ac, out_val); @@ -1823,21 +924,16 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, - voffset, ctx->gs2vs_offset, 0, - ac_glc | ac_slc, true); + voffset, + ac_get_arg(&ctx->ac, + ctx->args->gs2vs_offset), + 0, ac_glc | ac_slc | ac_swizzled); } } - gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, - ctx->ac.i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); - - if (!use_kill) - ac_build_endif(&ctx->ac, 6505); } static void @@ -1845,7 +941,7 @@ visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); return; } @@ -1859,8 +955,8 @@ load_tess_coord(struct ac_shader_abi *abi) struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef coord[4] = { - ctx->tes_u, - ctx->tes_v, + ac_get_arg(&ctx->ac, ctx->args->tes_u), + ac_get_arg(&ctx->ac, ctx->args->tes_v), ctx->ac.f32_0, ctx->ac.f32_0, }; @@ -1876,13 +972,14 @@ static LLVMValueRef load_patch_vertices_in(struct ac_shader_abi *abi) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false); + return LLVMConstInt(ctx->ac.i32, ctx->args->options->key.tcs.input_vertices, false); } static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi) { - return abi->base_vertex; + struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex); } static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi, @@ -1928,7 +1025,7 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef list = ctx->descriptor_sets[descriptor_set]; - struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_layout *layout = ctx->args->options->layout->set[descriptor_set].layout; struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; unsigned offset = binding->offset; unsigned stride = binding->size; @@ -2066,35 +1163,6 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); } -static unsigned -get_num_channels_from_data_format(unsigned data_format) -{ - switch (data_format) { - case V_008F0C_BUF_DATA_FORMAT_8: - case V_008F0C_BUF_DATA_FORMAT_16: - case V_008F0C_BUF_DATA_FORMAT_32: - return 1; - case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32: - return 2; - case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - case V_008F0C_BUF_DATA_FORMAT_32_32_32: - return 3; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - return 4; - default: - break; - } - - return 4; -} - static LLVMValueRef radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value, @@ -2116,10 +1184,8 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, for (unsigned i = 0; i < num_channels; i++) chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i); } else { - if (num_channels) { - assert(num_channels == 1); - chan[0] = value; - } + assert(num_channels == 1); + chan[0] = value; } for (unsigned i = num_channels; i < 4; i++) { @@ -2134,14 +1200,14 @@ static void handle_vs_input_decl(struct radv_shader_context *ctx, struct nir_variable *variable) { - LLVMValueRef t_list_ptr = ctx->vertex_buffers; + LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->vertex_buffers); LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef input; LLVMValueRef buffer_index; unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); uint8_t input_usage_mask = - ctx->shader_info->vs.input_usage_mask[variable->data.location]; + ctx->args->shader_info->vs.input_usage_mask[variable->data.location]; unsigned num_input_channels = util_last_bit(input_usage_mask); variable->data.driver_location = variable->data.location * 4; @@ -2150,14 +1216,14 @@ handle_vs_input_decl(struct radv_shader_context *ctx, for (unsigned i = 0; i < attrib_count; ++i) { LLVMValueRef output[4]; unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0; - unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index]; + unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index]; unsigned data_format = attrib_format & 0x0f; unsigned num_format = (attrib_format >> 4) & 0x07; bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT; - if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { - uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index]; + if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { + uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index]; if (divisor) { buffer_index = ctx->abi.instance_id; @@ -2170,50 +1236,100 @@ handle_vs_input_decl(struct radv_shader_context *ctx, buffer_index = ctx->ac.i32_0; } - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, ""); - } else - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, - ctx->abi.base_vertex, ""); + buffer_index = LLVMBuildAdd(ctx->ac.builder, + ac_get_arg(&ctx->ac, + ctx->args->ac.start_instance),\ + buffer_index, ""); + } else { + buffer_index = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + ac_get_arg(&ctx->ac, + ctx->args->ac.base_vertex), ""); + } + + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); /* Adjust the number of channels to load based on the vertex * attribute format. */ - unsigned num_format_channels = get_num_channels_from_data_format(data_format); - unsigned num_channels = MIN2(num_input_channels, num_format_channels); - unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index]; - unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; - unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; + unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); + unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index]; + unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index]; + unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index]; - if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { /* Always load, at least, 3 channels for formats that * need to be shuffled because X<->Z. */ num_channels = MAX2(num_channels, 3); } - if (attrib_stride != 0 && attrib_offset > attrib_stride) { - LLVMValueRef buffer_offset = - LLVMConstInt(ctx->ac.i32, - attrib_offset / attrib_stride, false); + t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - buffer_index = LLVMBuildAdd(ctx->ac.builder, - buffer_index, - buffer_offset, ""); + /* Perform per-channel vertex fetch operations if unaligned + * access are detected. Only GFX6 and GFX10 are affected. + */ + bool unaligned_vertex_fetches = false; + if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class == GFX10) && + vtx_info->chan_format != data_format && + ((attrib_offset % vtx_info->element_size) || + (attrib_stride % vtx_info->element_size))) + unaligned_vertex_fetches = true; + + if (unaligned_vertex_fetches) { + unsigned chan_format = vtx_info->chan_format; + LLVMValueRef values[4]; - attrib_offset = attrib_offset % attrib_stride; - } + assert(ctx->ac.chip_class == GFX6 || + ctx->ac.chip_class == GFX10); - t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + for (unsigned chan = 0; chan < num_channels; chan++) { + unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; + LLVMValueRef chan_index = buffer_index; + + if (attrib_stride != 0 && chan_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + chan_offset / attrib_stride, false); - input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, - buffer_index, - LLVMConstInt(ctx->ac.i32, attrib_offset, false), - ctx->ac.i32_0, ctx->ac.i32_0, - num_channels, - data_format, num_format, 0, true); + chan_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); - if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + chan_offset = chan_offset % attrib_stride; + } + + values[chan] = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + chan_index, + LLVMConstInt(ctx->ac.i32, chan_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, 1, + chan_format, num_format, 0, true); + } + + input = ac_build_gather_values(&ctx->ac, values, num_channels); + } else { + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + attrib_offset / attrib_stride, false); + + buffer_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; + } + + input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + buffer_index, + LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, + num_channels, + data_format, num_format, 0, true); + } + + if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { LLVMValueRef c[4]; c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); @@ -2235,7 +1351,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx, } } - unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; + unsigned alpha_adjust = (ctx->args->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]); for (unsigned chan = 0; chan < 4; chan++) { @@ -2272,10 +1388,21 @@ prepare_interp_optimize(struct radv_shader_context *ctx, uses_center = true; } + ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid); + ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid); + if (uses_center && uses_centroid) { - LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, ""); - ctx->abi.persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->abi.persp_center, ctx->abi.persp_centroid, ""); - ctx->abi.linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->abi.linear_center, ctx->abi.linear_centroid, ""); + LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, + ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask), + ctx->ac.i32_0, ""); + ctx->abi.persp_centroid = + LLVMBuildSelect(ctx->ac.builder, sel, + ac_get_arg(&ctx->ac, ctx->args->ac.persp_center), + ctx->abi.persp_centroid, ""); + ctx->abi.linear_centroid = + LLVMBuildSelect(ctx->ac.builder, sel, + ac_get_arg(&ctx->ac, ctx->args->ac.linear_center), + ctx->abi.linear_centroid, ""); } } @@ -2339,9 +1466,9 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { unsigned index = target - V_008DFC_SQ_EXP_MRT; - unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; - bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; - bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1; + unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf; + bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1; + bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1; unsigned chan; LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; @@ -2537,7 +1664,7 @@ radv_emit_stream_output(struct radv_shader_context *ctx, ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf], ctx->ac.i32_0, offset, - ac_glc | ac_slc, false); + ac_glc | ac_slc); } static void @@ -2546,9 +1673,10 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) int i; /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - assert(ctx->streamout_config); + assert(ctx->args->streamout_config.used); LLVMValueRef so_vtx_count = - ac_build_bfe(&ctx->ac, ctx->streamout_config, + ac_build_bfe(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->streamout_config), LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false); @@ -2569,7 +1697,8 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) * (streamout_write_index + thread_id)*stride[buffer_id] + * attrib_offset */ - LLVMValueRef so_write_index = ctx->streamout_write_idx; + LLVMValueRef so_write_index = + ac_get_arg(&ctx->ac, ctx->args->streamout_write_idx); /* Compute (streamout_write_index + thread_id). */ so_write_index = @@ -2580,10 +1709,10 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) */ LLVMValueRef so_write_offset[4] = {}; LLVMValueRef so_buffers[4] = {}; - LLVMValueRef buf_ptr = ctx->streamout_buffers; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); for (i = 0; i < 4; i++) { - uint16_t stride = ctx->shader_info->so.strides[i]; + uint16_t stride = ctx->args->shader_info->so.strides[i]; if (!stride) continue; @@ -2594,7 +1723,8 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - LLVMValueRef so_offset = ctx->streamout_offset[i]; + LLVMValueRef so_offset = + ac_get_arg(&ctx->ac, ctx->args->streamout_offset[i]); so_offset = LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), ""); @@ -2607,10 +1737,10 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) } /* Write streamout data. */ - for (i = 0; i < ctx->shader_info->so.num_outputs; i++) { + for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) { struct radv_shader_output_values shader_out = {}; struct radv_stream_output *output = - &ctx->shader_info->so.outputs[i]; + &ctx->args->shader_info->so.outputs[i]; if (stream != output->stream) continue; @@ -2642,6 +1772,7 @@ radv_build_param_exports(struct radv_shader_context *ctx, if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID && + slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 && slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0) @@ -2735,7 +1866,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, if (outinfo->writes_layer == true) pos_args[1].out[2] = layer_value; if (outinfo->writes_viewport_index == true) { - if (ctx->options->chip_class >= GFX9) { + if (ctx->args->options->chip_class >= GFX9) { /* GFX9 has the layer in out.z[10:0] and the viewport * index in out.z[19:16]. */ @@ -2761,12 +1892,10 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, outinfo->pos_exports++; } - /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. * Setting valid_mask=1 prevents it and has no other effect. */ - if (ctx->ac.family == CHIP_NAVI10 || - ctx->ac.family == CHIP_NAVI12 || - ctx->ac.family == CHIP_NAVI14) + if (ctx->ac.chip_class == GFX10) pos_args[0].valid_mask = 1; pos_idx = 0; @@ -2797,7 +1926,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs; unsigned noutput = 0; - if (ctx->options->key.has_multiview_view_index) { + if (ctx->args->options->key.has_multiview_view_index) { LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; if(!*tmp_out) { for(unsigned i = 0; i < 4; ++i) @@ -2805,7 +1934,8 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); } - LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, ctx->abi.view_index), *tmp_out); + LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index); + LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out); ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; } @@ -2813,9 +1943,9 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, sizeof(outinfo->vs_output_param_offset)); outinfo->pos_exports = 0; - if (!ctx->options->use_ngg_streamout && - ctx->shader_info->so.num_outputs && - !ctx->is_gs_copy_shader) { + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs && + !ctx->args->is_gs_copy_shader) { /* The GS copy shader emission already emits streamout. */ radv_emit_streamout(ctx, 0); } @@ -2832,16 +1962,16 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; if (ctx->stage == MESA_SHADER_VERTEX && - !ctx->is_gs_copy_shader) { + !ctx->args->is_gs_copy_shader) { outputs[noutput].usage_mask = - ctx->shader_info->vs.output_usage_mask[i]; + ctx->args->shader_info->vs.output_usage_mask[i]; } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { outputs[noutput].usage_mask = - ctx->shader_info->tes.output_usage_mask[i]; + ctx->args->shader_info->tes.output_usage_mask[i]; } else { - assert(ctx->is_gs_copy_shader); + assert(ctx->args->is_gs_copy_shader); outputs[noutput].usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; } for (unsigned j = 0; j < 4; j++) { @@ -2857,7 +1987,8 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; - outputs[noutput].values[0] = ctx->vs_prim_id; + outputs[noutput].values[0] = + ac_get_arg(&ctx->ac, ctx->args->vs_prim_id); for (unsigned j = 1; j < 4; j++) outputs[noutput].values[j] = ctx->ac.f32_0; noutput++; @@ -2878,7 +2009,9 @@ handle_es_outputs_post(struct radv_shader_context *ctx, if (ctx->ac.chip_class >= GFX9) { unsigned itemsize_dw = outinfo->esgs_itemsize / 4; LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); + LLVMValueRef wave_idx = + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, LLVMBuildMul(ctx->ac.builder, wave_idx, LLVMConstInt(ctx->ac.i32, @@ -2898,11 +2031,11 @@ handle_es_outputs_post(struct radv_shader_context *ctx, if (ctx->stage == MESA_SHADER_VERTEX) { output_usage_mask = - ctx->shader_info->vs.output_usage_mask[i]; + ctx->args->shader_info->vs.output_usage_mask[i]; } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); output_usage_mask = - ctx->shader_info->tes.output_usage_mask[i]; + ctx->args->shader_info->tes.output_usage_mask[i]; } param_index = shader_io_get_unique_index(i); @@ -2932,9 +2065,10 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, - NULL, ctx->es2gs_offset, + NULL, + ac_get_arg(&ctx->ac, ctx->args->es2gs_offset), (4 * param_index + j) * 4, - ac_glc | ac_slc, true); + ac_glc | ac_slc | ac_swizzled); } } } @@ -2944,7 +2078,7 @@ static void handle_ls_outputs_post(struct radv_shader_context *ctx) { LLVMValueRef vertex_id = ctx->rel_auto_id; - uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->vs.ls_outputs_written); + uint32_t num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false); LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); @@ -2971,12 +2105,13 @@ handle_ls_outputs_post(struct radv_shader_context *ctx) static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx) { - return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); + return ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); } static LLVMValueRef get_tgsize(struct radv_shader_context *ctx) { - return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 28, 4); + return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 28, 4); } static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) @@ -2990,7 +2125,7 @@ static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) { - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), LLVMConstInt(ctx->ac.i32, 12, false), LLVMConstInt(ctx->ac.i32, 9, false), false); @@ -2998,7 +2133,7 @@ static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) { - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), LLVMConstInt(ctx->ac.i32, 22, false), LLVMConstInt(ctx->ac.i32, 9, false), false); @@ -3006,9 +2141,9 @@ static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) static LLVMValueRef ngg_get_ordered_id(struct radv_shader_context *ctx) { - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), ctx->ac.i32_0, - LLVMConstInt(ctx->ac.i32, 11, false), + LLVMConstInt(ctx->ac.i32, 12, false), false); } @@ -3017,7 +2152,7 @@ ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) { unsigned num_outputs = util_bitcount64(ctx->output_mask); - if (ctx->options->key.has_multiview_view_index) + if (ctx->args->options->key.has_multiview_view_index) num_outputs++; LLVMTypeRef elements[2] = { @@ -3094,101 +2229,28 @@ ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, return ngg_gs_vertex_ptr(ctx, vertexidx); } -/* Send GS Alloc Req message from the first wave of the group to SPI. - * Message payload is: - * - bits 0..10: vertices in group - * - bits 12..22: primitives in group - */ -static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx, - LLVMValueRef vtx_cnt, - LLVMValueRef prim_cnt) +static LLVMValueRef +ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, + unsigned out_idx) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5020); - - tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),""); - tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp); - - ac_build_endif(&ctx->ac, 5020); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first struct entry */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -struct ngg_prim { - unsigned num_vertices; - LLVMValueRef isnull; - LLVMValueRef swap; - LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; -}; - -static void build_export_prim(struct radv_shader_context *ctx, - const struct ngg_prim *prim) +static LLVMValueRef +ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, + unsigned stream) { - LLVMBuilderRef builder = ctx->ac.builder; - struct ac_export_args args; - LLVMValueRef vertices[3]; - LLVMValueRef odd, even; - LLVMValueRef tmp; - - tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, ""); - args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), ""); - - for (unsigned i = 0; i < prim->num_vertices; ++i) { - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, ""); - tmp = LLVMBuildShl(builder, tmp, - LLVMConstInt(ctx->ac.i32, 9, false), ""); - vertices[i] = LLVMBuildOr(builder, prim->index[i], tmp, ""); - } - - switch (prim->num_vertices) { - case 1: - args.out[0] = LLVMBuildOr(builder, args.out[0], vertices[0], ""); - break; - case 2: - tmp = LLVMBuildShl(builder, vertices[1], - LLVMConstInt(ctx->ac.i32, 10, false), ""); - tmp = LLVMBuildOr(builder, args.out[0], tmp, ""); - args.out[0] = LLVMBuildOr(builder, tmp, vertices[0], ""); - break; - case 3: - /* Swap vertices if needed to follow drawing order. */ - tmp = LLVMBuildShl(builder, vertices[2], - LLVMConstInt(ctx->ac.i32, 20, false), ""); - even = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildShl(builder, vertices[1], - LLVMConstInt(ctx->ac.i32, 10, false), ""); - even = LLVMBuildOr(builder, even, tmp, ""); - even = LLVMBuildOr(builder, even, vertices[0], ""); - - tmp = LLVMBuildShl(builder, vertices[1], - LLVMConstInt(ctx->ac.i32, 20, false), ""); - odd = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildShl(builder, vertices[2], - LLVMConstInt(ctx->ac.i32, 10, false), ""); - odd = LLVMBuildOr(builder, odd, tmp, ""); - odd = LLVMBuildOr(builder, odd, vertices[0], ""); - - args.out[0] = LLVMBuildSelect(builder, prim->swap, odd, even, ""); - break; - default: - unreachable("invalid number of vertices"); - } - - args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, ""); - args.out[1] = LLVMGetUndef(ctx->ac.f32); - args.out[2] = LLVMGetUndef(ctx->ac.f32); - args.out[3] = LLVMGetUndef(ctx->ac.f32); - - args.target = V_008DFC_SQ_EXP_PRIM; - args.enabled_channels = 1; - args.done = true; - args.valid_mask = false; - args.compr = false; - - ac_build_export(&ctx->ac, &args); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } static struct radv_stream_output * @@ -3207,7 +2269,7 @@ static void build_streamout_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMValueRef offset_vtx, LLVMValueRef vertexptr) { - struct radv_streamout_info *so = &ctx->shader_info->so; + struct radv_streamout_info *so = &ctx->args->shader_info->so; LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef offset[4] = {}; LLVMValueRef tmp; @@ -3229,9 +2291,9 @@ static void build_streamout_vertex(struct radv_shader_context *ctx, for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - output_stream = ctx->shader_info->gs.output_streams[i]; + output_stream = ctx->args->shader_info->gs.output_streams[i]; if (!(ctx->output_mask & (1ull << i)) || output_stream != stream) @@ -3277,7 +2339,7 @@ static void build_streamout_vertex(struct radv_shader_context *ctx, } else { for (unsigned i = 0; i < so->num_outputs; ++i) { struct radv_stream_output *output = - &ctx->shader_info->so.outputs[i]; + &ctx->args->shader_info->so.outputs[i]; if (stream != output->stream) continue; @@ -3321,9 +2383,9 @@ struct ngg_streamout { static void build_streamout(struct radv_shader_context *ctx, struct ngg_streamout *nggso) { - struct radv_streamout_info *so = &ctx->shader_info->so; + struct radv_streamout_info *so = &ctx->args->shader_info->so; LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef buf_ptr = ctx->streamout_buffers; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef cond, tmp, tmp2; LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); @@ -3402,7 +2464,7 @@ static void build_streamout(struct radv_shader_context *ctx, unsigned swizzle[4]; int unused_stream = -1; for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->shader_info->gs.num_stream_output_components[stream]) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) { unused_stream = stream; break; } @@ -3488,7 +2550,7 @@ static void build_streamout(struct radv_shader_context *ctx, LLVMValueRef emit_vgpr = ctx->ac.i32_0; for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->shader_info->gs.num_stream_output_components[stream]) + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) continue; /* Load the number of generated primitives from GDS and @@ -3551,7 +2613,7 @@ static void build_streamout(struct radv_shader_context *ctx, if (isgs) { for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->shader_info->gs.num_stream_output_components[stream]) + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) continue; primemit_scan[stream].enable_exclusive = true; @@ -3587,7 +2649,7 @@ static void build_streamout(struct radv_shader_context *ctx, } for (unsigned stream = 0; stream < 4; ++stream) { - if (ctx->shader_info->gs.num_stream_output_components[stream]) { + if (ctx->args->shader_info->gs.num_stream_output_components[stream]) { nggso->emit[stream] = ac_build_readlane( &ctx->ac, scratch_vgpr, LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); @@ -3597,7 +2659,7 @@ static void build_streamout(struct radv_shader_context *ctx, /* Write out primitive data */ for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->shader_info->gs.num_stream_output_components[stream]) + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) continue; if (isgs) { @@ -3635,8 +2697,8 @@ static unsigned ngg_nogs_vertex_size(struct radv_shader_context *ctx) { unsigned lds_vertex_size = 0; - if (ctx->shader_info->so.num_outputs) - lds_vertex_size = 4 * ctx->shader_info->so.num_outputs + 1; + if (ctx->args->shader_info->so.num_outputs) + lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1; return lds_vertex_size; } @@ -3659,22 +2721,22 @@ static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, static void handle_ngg_outputs_post_1(struct radv_shader_context *ctx) { - struct radv_streamout_info *so = &ctx->shader_info->so; + struct radv_streamout_info *so = &ctx->args->shader_info->so; LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef vertex_ptr = NULL; LLVMValueRef tmp, tmp2; assert((ctx->stage == MESA_SHADER_VERTEX || - ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader); + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader); - if (!ctx->shader_info->so.num_outputs) + if (!ctx->args->shader_info->so.num_outputs) return; vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); for (unsigned i = 0; i < so->num_outputs; ++i) { struct radv_stream_output *output = - &ctx->shader_info->so.outputs[i]; + &ctx->args->shader_info->so.outputs[i]; unsigned loc = output->location; @@ -3699,18 +2761,20 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) LLVMValueRef tmp; assert((ctx->stage == MESA_SHADER_VERTEX || - ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader); + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader); - LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8); - LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8); + LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8); + LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 0, 8); LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, ""); LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, ""); LLVMValueRef vtxindex[] = { - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16), - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16), - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 16, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[2]), 0, 16), }; /* Determine the number of vertices per primitive. */ @@ -3720,7 +2784,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) if (ctx->stage == MESA_SHADER_VERTEX) { LLVMValueRef outprim_val = LLVMConstInt(ctx->ac.i32, - ctx->options->key.vs.outprim, false); + ctx->args->options->key.vs.outprim, false); num_vertices_val = LLVMBuildAdd(builder, outprim_val, ctx->ac.i32_1, ""); num_vertices = 3; /* TODO: optimize for points & lines */ @@ -3738,7 +2802,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) } /* Streamout */ - if (ctx->shader_info->so.num_outputs) { + if (ctx->args->shader_info->so.num_outputs) { struct ngg_streamout nggso = {}; nggso.num_vertices = num_vertices_val; @@ -3754,8 +2818,8 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) * to the ES thread of the provoking vertex. */ if (ctx->stage == MESA_SHADER_VERTEX && - ctx->options->key.vs_common_out.export_prim_id) { - if (ctx->shader_info->so.num_outputs) + ctx->args->options->key.vs_common_out.export_prim_id) { + if (ctx->args->shader_info->so.num_outputs) ac_build_s_barrier(&ctx->ac); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); @@ -3768,24 +2832,18 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) LLVMValueRef provoking_vtx_index = LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); - LLVMBuildStore(builder, ctx->abi.gs_prim_id, + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id), ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index)); ac_build_endif(&ctx->ac, 5400); } /* TODO: primitive culling */ - build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); /* TODO: streamout queries */ - /* Export primitive data to the index buffer. Format is: - * - bits 0..8: index 0 - * - bit 9: edge flag 0 - * - bits 10..18: index 1 - * - bit 19: edge flag 1 - * - bits 20..28: index 2 - * - bit 29: edge flag 2 - * - bit 31: null primitive (skip) + /* Export primitive data to the index buffer. * * For the first version, we will always build up all three indices * independent of the primitive type. The additional garbage data @@ -3796,20 +2854,24 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) */ ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); { - struct ngg_prim prim = {}; + struct ac_ngg_prim prim = {}; - prim.num_vertices = num_vertices; - prim.isnull = ctx->ac.i1false; - prim.swap = ctx->ac.i1false; - memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); - - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id, - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) { + prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]); + } else { + prim.num_vertices = num_vertices; + prim.isnull = ctx->ac.i1false; + memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + + for (unsigned i = 0; i < num_vertices; ++i) { + tmp = LLVMBuildLShr(builder, + ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); + prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + } } - build_export_prim(ctx, &prim); + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 6001); @@ -3817,15 +2879,16 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) ac_build_ifcc(&ctx->ac, is_es_thread, 6002); { struct radv_vs_output_info *outinfo = - ctx->stage == MESA_SHADER_TESS_EVAL ? &ctx->shader_info->tes.outinfo : &ctx->shader_info->vs.outinfo; + ctx->stage == MESA_SHADER_TESS_EVAL ? + &ctx->args->shader_info->tes.outinfo : &ctx->args->shader_info->vs.outinfo; /* Exporting the primitive ID is handled below. */ /* TODO: use the new VS export path */ handle_vs_outputs_post(ctx, false, - ctx->options->key.vs_common_out.export_clip_dists, + ctx->args->options->key.vs_common_out.export_clip_dists, outinfo); - if (ctx->options->key.vs_common_out.export_prim_id) { + if (ctx->args->options->key.vs_common_out.export_prim_id) { unsigned param_count = outinfo->param_exports; LLVMValueRef values[4]; @@ -3838,7 +2901,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) values[0] = LLVMBuildLoad(builder, tmp, ""); } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); - values[0] = ctx->abi.tes_patch_id; + values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id); } values[0] = ac_to_float(&ctx->ac, values[0]); @@ -3899,7 +2962,7 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) unsigned num_components; num_components = - ctx->shader_info->gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -3919,13 +2982,8 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second entry of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - LLVMBuildStore(builder, i8_0, tmp); + LLVMBuildStore(builder, i8_0, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); ac_build_endloop(&ctx->ac, 5100); } @@ -3935,7 +2993,7 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) unsigned num_components; num_components = - ctx->shader_info->gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -3967,23 +3025,18 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); /* Streamout */ - if (ctx->shader_info->so.num_outputs) { + if (ctx->args->shader_info->so.num_outputs) { struct ngg_streamout nggso = {}; nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->shader_info->gs.num_stream_output_components[stream]) + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) continue; - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); @@ -3999,6 +3052,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) build_streamout(ctx, &nggso); } + /* Write shader query data. */ + tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + /* Use a plain GDS atomic to accumulate the number of generated + * primitives. + */ + ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, + tmp, sync_scope); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); + /* TODO: culling */ /* Determine vertex liveness. */ @@ -4019,13 +3099,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) /* Load primitive liveness */ tmp = ngg_gs_vertex_ptr(ctx, primidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* stream 0 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); @@ -4071,7 +3146,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) * there are 4 or more contiguous null primitives in the export * (in the common case of single-dword prim exports). */ - build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + vertlive_scan.result_reduce, num_emit_threads); /* Setup the reverse vertex compaction permutation. We re-use stream 1 * of the primitive liveness flags, relying on the fact that each @@ -4079,14 +3155,9 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) ac_build_ifcc(&ctx->ac, vertlive, 5130); { tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, tmp); + LLVMBuildStore(builder, tmp2, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); } ac_build_endif(&ctx->ac, 5130); @@ -4096,22 +3167,14 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); ac_build_ifcc(&ctx->ac, tmp, 5140); { - struct ngg_prim prim = {}; + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; prim.num_vertices = verts_per_prim; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* primflag */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); - prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp, - LLVMConstInt(ctx->ac.i8, 0, false), ""); - prim.swap = LLVMBuildICmp(builder, LLVMIntEQ, - LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->ac.i32, 1, false), ""), - LLVMConstInt(ctx->ac.i32, 1, false), ""); + flags = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, @@ -4119,7 +3182,25 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) prim.edgeflag[i] = ctx->ac.i1false; } - build_export_prim(ctx, &prim); + /* Geometry shaders output triangle strips, but NGG expects + * triangles. We need to change the vertex order for odd + * triangles to get correct front/back facing by swapping 2 + * vertex indices, but we also have to keep the provoking + * vertex in the same place. + */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + + struct ac_ngg_prim in = prim; + prim.index[0] = in.index[0]; + prim.index[1] = LLVMBuildSelect(builder, is_odd, + in.index[2], in.index[1], ""); + prim.index[2] = LLVMBuildSelect(builder, is_odd, + in.index[1], in.index[2], ""); + } + + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 5140); @@ -4127,8 +3208,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); ac_build_ifcc(&ctx->ac, tmp, 5145); { - struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo; - bool export_view_index = ctx->options->key.has_multiview_view_index; + struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo; + bool export_view_index = ctx->args->options->key.has_multiview_view_index; struct radv_shader_output_values *outputs; unsigned noutput = 0; @@ -4141,21 +3222,15 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) outinfo->pos_exports = 0; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1: source data index */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); unsigned out_idx = 0; - gep_idx[1] = ctx->ac.i32_0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i))) @@ -4169,8 +3244,7 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) if (!(output_usage_mask & (1 << j))) continue; - gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); tmp = LLVMBuildLoad(builder, tmp, ""); LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); @@ -4193,14 +3267,15 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) outputs[noutput].slot_name = VARYING_SLOT_LAYER; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; - outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index); + outputs[noutput].values[0] = + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index)); for (unsigned j = 1; j < 4; j++) outputs[noutput].values[j] = ctx->ac.f32_0; noutput++; } radv_llvm_export_vs(ctx, outputs, noutput, outinfo, - ctx->options->key.vs_common_out.export_clip_dists); + ctx->args->options->key.vs_common_out.export_clip_dists); FREE(outputs); } ac_build_endif(&ctx->ac, 5145); @@ -4208,34 +3283,20 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs) { LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - */ - const LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - ac_build_ifcc(&ctx->ac, can_emit, 9001); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); unsigned out_idx = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -4249,20 +3310,21 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first entry of struct */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - out_val = ac_to_integer(&ctx->ac, out_val); out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); - LLVMBuildStore(builder, out_val, ptr); + LLVMBuildStore(builder, out_val, + ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } - assert(out_idx * 4 <= ctx->shader_info->gs.gsvs_vertex_size); + assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size); + + /* Store the current number of emitted vertices to zero out remaining + * primitive flags in case the geometry shader doesn't emit the maximum + * number of vertices. + */ + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); @@ -4271,40 +3333,51 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. + */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && + si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - const LLVMValueRef primflagptr = - LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp, primflagptr); + tmp = LLVMBuildOr(builder, tmp, + LLVMBuildShl(builder, + LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), + ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); - - ac_build_endif(&ctx->ac, 9001); } static void write_tess_factors(struct radv_shader_context *ctx) { unsigned stride, outer_comps, inner_comps; - LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5); - LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); + LLVMValueRef tcs_rel_ids = ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids); + LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 8, 5); + LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 0, 8); unsigned tess_inner_index = 0, tess_outer_index; LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer; LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; int i; ac_emit_barrier(&ctx->ac, ctx->stage); - switch (ctx->options->key.tcs.primitive_mode) { + switch (ctx->args->options->key.tcs.primitive_mode) { case GL_ISOLINES: stride = 2; outer_comps = 2; @@ -4346,7 +3419,7 @@ write_tess_factors(struct radv_shader_context *ctx) } // LINES reversal - if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) { + if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) { outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer); lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer, ctx->ac.i32_1, ""); @@ -4375,12 +3448,12 @@ write_tess_factors(struct radv_shader_context *ctx) buffer = ctx->hs_ring_tess_factor; - tf_base = ctx->tess_factor_offset; + tf_base = ac_get_arg(&ctx->ac, ctx->args->tess_factor_offset); byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, false), ""); unsigned tf_offset = 0; - if (ctx->options->chip_class <= GFX8) { + if (ctx->ac.chip_class <= GFX8) { ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504); @@ -4389,7 +3462,7 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, false), 1, ctx->ac.i32_0, tf_base, - 0, ac_glc, false); + 0, ac_glc); tf_offset += 4; ac_build_endif(&ctx->ac, 6504); @@ -4398,14 +3471,14 @@ write_tess_factors(struct radv_shader_context *ctx) /* Store the tessellation factors. */ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, - tf_offset, ac_glc, false); + tf_offset, ac_glc); if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, - 16 + tf_offset, ac_glc, false); + 16 + tf_offset, ac_glc); //store to offchip for TES to read - only if TES reads them - if (ctx->options->key.tcs.tes_reads_tess_factors) { + if (ctx->args->options->key.tcs.tes_reads_tess_factors) { LLVMValueRef inner_vec, outer_vec, tf_outer_offset; LLVMValueRef tf_inner_offset; unsigned param_outer, param_inner; @@ -4419,7 +3492,8 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec, outer_comps, tf_outer_offset, - ctx->oc_lds, 0, ac_glc, false); + ac_get_arg(&ctx->ac, ctx->args->oc_lds), + 0, ac_glc); if (inner_comps) { param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER); tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL, @@ -4429,7 +3503,8 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_gather_values(&ctx->ac, inner, inner_comps); ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec, inner_comps, tf_inner_offset, - ctx->oc_lds, 0, ac_glc, false); + ac_get_arg(&ctx->ac, ctx->args->oc_lds), + 0, ac_glc); } } @@ -4496,15 +3571,15 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) } /* Process depth, stencil, samplemask. */ - if (ctx->shader_info->ps.writes_z) { + if (ctx->args->shader_info->ps.writes_z) { depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); } - if (ctx->shader_info->ps.writes_stencil) { + if (ctx->args->shader_info->ps.writes_stencil) { stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); } - if (ctx->shader_info->ps.writes_sample_mask) { + if (ctx->args->shader_info->ps.writes_sample_mask) { samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); } @@ -4513,9 +3588,9 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) * exported. */ if (index > 0 && - !ctx->shader_info->ps.writes_z && - !ctx->shader_info->ps.writes_stencil && - !ctx->shader_info->ps.writes_sample_mask) { + !ctx->args->shader_info->ps.writes_z && + !ctx->args->shader_info->ps.writes_stencil && + !ctx->args->shader_info->ps.writes_sample_mask) { unsigned last = index - 1; color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */ @@ -4535,7 +3610,7 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) static void emit_gs_epilogue(struct radv_shader_context *ctx) { - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_1(ctx); return; } @@ -4554,16 +3629,16 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, switch (ctx->stage) { case MESA_SHADER_VERTEX: - if (ctx->options->key.vs_common_out.as_ls) + if (ctx->args->options->key.vs_common_out.as_ls) handle_ls_outputs_post(ctx); - else if (ctx->options->key.vs_common_out.as_es) - handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info); - else if (ctx->options->key.vs_common_out.as_ngg) + else if (ctx->args->options->key.vs_common_out.as_es) + handle_es_outputs_post(ctx, &ctx->args->shader_info->vs.es_info); + else if (ctx->args->options->key.vs_common_out.as_ngg) handle_ngg_outputs_post_1(ctx); else - handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, - &ctx->shader_info->vs.outinfo); + handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, + ctx->args->options->key.vs_common_out.export_clip_dists, + &ctx->args->shader_info->vs.outinfo); break; case MESA_SHADER_FRAGMENT: handle_fs_outputs_post(ctx); @@ -4575,14 +3650,14 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, handle_tcs_outputs_post(ctx); break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs_common_out.as_es) - handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info); - else if (ctx->options->key.vs_common_out.as_ngg) + if (ctx->args->options->key.vs_common_out.as_es) + handle_es_outputs_post(ctx, &ctx->args->shader_info->tes.es_info); + else if (ctx->args->options->key.vs_common_out.as_ngg) handle_ngg_outputs_post_1(ctx); else - handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, - &ctx->shader_info->tes.outinfo); + handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, + ctx->args->options->key.vs_common_out.export_clip_dists, + &ctx->args->shader_info->tes.outinfo); break; default: break; @@ -4611,15 +3686,15 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) case MESA_SHADER_GEOMETRY: return; case MESA_SHADER_VERTEX: - if (ctx->options->key.vs_common_out.as_ls || - ctx->options->key.vs_common_out.as_es) + if (ctx->args->options->key.vs_common_out.as_ls || + ctx->args->options->key.vs_common_out.as_es) return; - outinfo = &ctx->shader_info->vs.outinfo; + outinfo = &ctx->args->shader_info->vs.outinfo; break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs_common_out.as_es) + if (ctx->args->options->key.vs_common_out.as_es) return; - outinfo = &ctx->shader_info->tes.outinfo; + outinfo = &ctx->args->shader_info->tes.outinfo; break; default: unreachable("Unhandled shader type"); @@ -4628,16 +3703,16 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) ac_optimize_vs_outputs(&ctx->ac, ctx->main_function, outinfo->vs_output_param_offset, - VARYING_SLOT_MAX, + VARYING_SLOT_MAX, 0, &outinfo->param_exports); } static void ac_setup_rings(struct radv_shader_context *ctx) { - if (ctx->options->chip_class <= GFX8 && + if (ctx->args->options->chip_class <= GFX8 && (ctx->stage == MESA_SHADER_GEOMETRY || - ctx->options->key.vs_common_out.as_es || ctx->options->key.vs_common_out.as_es)) { + ctx->args->options->key.vs_common_out.as_es || ctx->args->options->key.vs_common_out.as_es)) { unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS; LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false); @@ -4647,7 +3722,7 @@ ac_setup_rings(struct radv_shader_context *ctx) offset); } - if (ctx->is_gs_copy_shader) { + if (ctx->args->is_gs_copy_shader) { ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, @@ -4678,7 +3753,7 @@ ac_setup_rings(struct radv_shader_context *ctx) LLVMValueRef ring, tmp; num_components = - ctx->shader_info->gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -4742,22 +3817,40 @@ radv_nir_get_max_workgroup_size(enum chip_class chip_class, /* Fixup the HW not emitting the TCS regs if there are no HS threads. */ static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx) { - LLVMValueRef count = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8); + LLVMValueRef count = + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8); LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, ""); - ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, ""); - ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, ""); - ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, ""); + ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, + ac_get_arg(&ctx->ac, ctx->args->rel_auto_id), + ctx->abi.instance_id, ""); + ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, + ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids), + ctx->rel_auto_id, + ""); + ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, + ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id), + ctx->abi.vertex_id, ""); } -static void prepare_gs_input_vgprs(struct radv_shader_context *ctx) +static void prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged) { - for(int i = 5; i >= 0; --i) { - ctx->gs_vtx_offset[i] = ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[i & ~1], - (i & 1) * 16, 16); - } + if (merged) { + for(int i = 5; i >= 0; --i) { + ctx->gs_vtx_offset[i] = + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i & ~1]), + (i & 1) * 16, 16); + } - ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8); + ctx->gs_wave_id = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), + 16, 8); + } else { + for (int i = 0; i < 6; i++) + ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i]); + ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->gs_wave_id); + } } /* Ensure that the esgs ring is declared. @@ -4784,62 +3877,62 @@ static LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, int shader_count, - struct radv_shader_info *shader_info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; - unsigned i; - ctx.options = options; - ctx.shader_info = shader_info; + ctx.args = args; enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT; - if (shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { + if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO; } - ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, shader_info->wave_size, 64); + ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, + args->options->family, float_mode, + args->shader_info->wave_size, + args->shader_info->ballot_bit_size); ctx.context = ctx.ac.context; - for (i = 0; i < MAX_SETS; i++) - shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; - for (i = 0; i < AC_UD_MAX_UD; i++) - shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; - ctx.max_workgroup_size = 0; for (int i = 0; i < shader_count; ++i) { ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size, - radv_nir_get_max_workgroup_size(ctx.options->chip_class, + radv_nir_get_max_workgroup_size(args->options->chip_class, shaders[i]->info.stage, shaders[i])); } if (ctx.ac.chip_class >= GFX10) { if (is_pre_gs_stage(shaders[0]->info.stage) && - options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { ctx.max_workgroup_size = 128; } } - create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2, - shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX); + create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2); ctx.abi.inputs = &ctx.inputs[0]; ctx.abi.emit_outputs = handle_shader_outputs_post; - ctx.abi.emit_vertex = visit_emit_vertex; + ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter; ctx.abi.load_ubo = radv_load_ubo; ctx.abi.load_ssbo = radv_load_ssbo; ctx.abi.load_sampler_desc = radv_get_sampler_desc; ctx.abi.load_resource = radv_load_resource; ctx.abi.clamp_shadow_reference = false; - ctx.abi.robust_buffer_access = options->robust_buffer_access; + ctx.abi.robust_buffer_access = args->options->robust_buffer_access; - bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs_common_out.as_ngg; + bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg; if (shader_count >= 2 || is_ngg) ac_init_exec_full_mask(&ctx.ac); - if (options->has_ls_vgpr_init_bug && + if (args->ac.vertex_id.used) + ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id); + if (args->rel_auto_id.used) + ctx.rel_auto_id = ac_get_arg(&ctx.ac, args->rel_auto_id); + if (args->ac.instance_id.used) + ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id); + + if (args->options->has_ls_vgpr_init_bug && shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); @@ -4851,16 +3944,19 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, * Add an extra dword per vertex to ensure an odd stride, which * avoids bank conflicts for SoA accesses. */ - declare_esgs_ring(&ctx); + if (!args->options->key.vs_common_out.as_ngg_passthrough) + declare_esgs_ring(&ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. */ - LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); - ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, - asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + if (args->shader_info->so.num_outputs) { + LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); + ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, + asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + } } for(int i = 0; i < shader_count; ++i) { @@ -4873,7 +3969,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - if (ctx.options->key.vs_common_out.as_ngg) { + if (args->options->key.vs_common_out.as_ngg) { for (unsigned i = 0; i < 4; ++i) { ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); @@ -4882,7 +3978,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, } unsigned scratch_size = 8; - if (ctx.shader_info->so.num_outputs) + if (args->shader_info->so.num_outputs) scratch_size = 44; LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size); @@ -4905,26 +4001,36 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.load_patch_vertices_in = load_patch_vertices_in; ctx.abi.store_tcs_outputs = store_tcs_output; if (shader_count == 1) - ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs; + ctx.tcs_num_inputs = args->options->key.tcs.num_inputs; else - ctx.tcs_num_inputs = util_last_bit64(shader_info->vs.ls_outputs_written); - ctx.tcs_num_patches = get_tcs_num_patches(&ctx); + ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written); + unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written); + unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written); + ctx.tcs_num_patches = + get_tcs_num_patches( + ctx.args->options->key.tcs.input_vertices, + ctx.shader->info.tess.tcs_vertices_out, + ctx.tcs_num_inputs, + tcs_num_outputs, + tcs_num_patch_outputs, + ctx.args->options->tess_offchip_block_dw_size, + ctx.args->options->chip_class, + ctx.args->options->family); } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; - ctx.tcs_num_patches = ctx.options->key.tes.num_patches; + ctx.tcs_num_patches = args->options->key.tes.num_patches; } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) { ctx.abi.load_base_vertex = radv_load_base_vertex; } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; - ctx.abi.emit_kill = radv_emit_kill; } if (shaders[i]->info.stage == MESA_SHADER_VERTEX && - ctx.options->key.vs_common_out.as_ngg && - ctx.options->key.vs_common_out.export_prim_id) { + args->options->key.vs_common_out.as_ngg && + args->options->key.vs_common_out.export_prim_id) { declare_esgs_ring(&ctx); } @@ -4932,7 +4038,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, if (i) { if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && - ctx.options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_prologue(&ctx); nested_barrier = false; } else { @@ -4966,13 +4072,16 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_setup_rings(&ctx); - LLVMBasicBlockRef merge_block; + LLVMBasicBlockRef merge_block = NULL; if (shader_count >= 2 || is_ngg) { LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); - LLVMValueRef count = ac_unpack_param(&ctx.ac, ctx.merged_wave_info, 8 * i, 8); + LLVMValueRef count = + ac_unpack_param(&ctx.ac, + ac_get_arg(&ctx.ac, args->merged_wave_info), + 8 * i, 8); LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac); LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, ""); @@ -4985,10 +4094,10 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, prepare_interp_optimize(&ctx, shaders[i]); else if(shaders[i]->info.stage == MESA_SHADER_VERTEX) handle_vs_inputs(&ctx, shaders[i]); - else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY) - prepare_gs_input_vgprs(&ctx); + else if(shaders[i]->info.stage == MESA_SHADER_GEOMETRY) + prepare_gs_input_vgprs(&ctx, shader_count >= 2); - ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]); + ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[i]); if (shader_count >= 2 || is_ngg) { LLVMBuildBr(ctx.ac.builder, merge_block); @@ -4998,37 +4107,46 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, /* This needs to be outside the if wrapping the shader body, as sometimes * the HW generates waves with 0 es/vs threads. */ if (is_pre_gs_stage(shaders[i]->info.stage) && - ctx.options->key.vs_common_out.as_ngg && + args->options->key.vs_common_out.as_ngg && i == shader_count - 1) { handle_ngg_outputs_post_2(&ctx); } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && - ctx.options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_2(&ctx); } if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { - shader_info->tcs.num_patches = ctx.tcs_num_patches; - shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx); + unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written); + unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written); + args->shader_info->tcs.num_patches = ctx.tcs_num_patches; + args->shader_info->tcs.lds_size = + calculate_tess_lds_size( + ctx.args->options->key.tcs.input_vertices, + ctx.shader->info.tess.tcs_vertices_out, + ctx.tcs_num_inputs, + ctx.tcs_num_patches, + tcs_num_outputs, + tcs_num_patch_outputs); } } LLVMBuildRetVoid(ctx.ac.builder); - if (options->dump_preoptir) { + if (args->options->dump_preoptir) { fprintf(stderr, "%s LLVM IR:\n\n", - radv_get_shader_name(shader_info, + radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage)); ac_dump_module(ctx.ac.module); fprintf(stderr, "\n"); } - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); if (shader_count == 1) ac_nir_eliminate_const_vs_outputs(&ctx); - if (options->dump_shader) { - ctx.shader_info->private_mem_vgprs = + if (args->options->dump_shader) { + args->shader_info->private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_function); } @@ -5119,31 +4237,29 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, free(elf_buffer); } -void +static void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, - struct radv_shader_info *shader_info, + const struct radv_shader_args *args, struct nir_shader *const *nir, - int nir_count, - const struct radv_nir_compiler_options *options) + int nir_count) { LLVMModuleRef llvm_module; - llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info, - options); + llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args); ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage, - radv_get_shader_name(shader_info, + radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage), - options); + args->options); /* Determine the ES type (VS or TES) for the GS on GFX9. */ - if (options->chip_class >= GFX9) { + if (args->options->chip_class >= GFX9) { if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) { - shader_info->gs.es_type = nir[0]->info.stage; + args->shader_info->gs.es_type = nir[0]->info.stage; } } } @@ -5152,15 +4268,18 @@ static void ac_gs_copy_shader_emit(struct radv_shader_context *ctx) { LLVMValueRef vtx_offset = - LLVMBuildMul(ctx->ac.builder, ctx->abi.vertex_id, + LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id), LLVMConstInt(ctx->ac.i32, 4, false), ""); LLVMValueRef stream_id; /* Fetch the vertex stream ID. */ - if (!ctx->options->use_ngg_streamout && - ctx->shader_info->so.num_outputs) { + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs) { stream_id = - ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2); + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, + ctx->args->streamout_config), + 24, 2); } else { stream_id = ctx->ac.i32_0; } @@ -5174,14 +4293,14 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) for (unsigned stream = 0; stream < 4; stream++) { unsigned num_components = - ctx->shader_info->gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; LLVMBasicBlockRef bb; unsigned offset; if (stream > 0 && !num_components) continue; - if (stream > 0 && !ctx->shader_info->so.num_outputs) + if (stream > 0 && !ctx->args->shader_info->so.num_outputs) continue; bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out"); @@ -5191,9 +4310,9 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) offset = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; unsigned output_stream = - ctx->shader_info->gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i)) || @@ -5229,13 +4348,13 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) } } - if (!ctx->options->use_ngg_streamout && - ctx->shader_info->so.num_outputs) + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs) radv_emit_streamout(ctx, stream); if (stream == 0) { handle_vs_outputs_post(ctx, false, true, - &ctx->shader_info->vs.outinfo); + &ctx->args->shader_info->vs.outinfo); } LLVMBuildBr(ctx->ac.builder, end_bb); @@ -5244,26 +4363,25 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb); } -void +static void radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, - struct radv_shader_info *shader_info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; - ctx.options = options; - ctx.shader_info = shader_info; + ctx.args = args; - ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, AC_FLOAT_MODE_DEFAULT, 64, 64); + assert(args->is_gs_copy_shader); + + ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, + args->options->family, AC_FLOAT_MODE_DEFAULT, 64, 64); ctx.context = ctx.ac.context; - ctx.is_gs_copy_shader = true; ctx.stage = MESA_SHADER_VERTEX; ctx.shader = geom_shader; - create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX); + create_function(&ctx, MESA_SHADER_VERTEX, false); ac_setup_rings(&ctx); @@ -5277,10 +4395,43 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, LLVMBuildRetVoid(ctx.ac.builder); - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, - MESA_SHADER_VERTEX, "GS Copy Shader", options); + MESA_SHADER_VERTEX, "GS Copy Shader", args->options); (*rbinary)->is_gs_copy_shader = true; } + +void +llvm_compile_shader(struct radv_device *device, + unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary **binary, + struct radv_shader_args *args) +{ + enum ac_target_machine_options tm_options = 0; + struct ac_llvm_compiler ac_llvm; + bool thread_compiler; + + tm_options |= AC_TM_SUPPORTS_SPILL; + if (args->options->check_ir) + tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; + + thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); + + radv_init_llvm_compiler(&ac_llvm, thread_compiler, + args->options->family, tm_options, + args->shader_info->wave_size); + + if (args->is_gs_copy_shader) { + radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args); + } else { + radv_compile_nir_shader(&ac_llvm, binary, args, + shaders, shader_count); + } + + radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); +}