diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 1b77bd70eec..51416262df1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -28,13 +28,10 @@
 #include "radv_private.h"
 #include "radv_shader.h"
 #include "radv_shader_helper.h"
+#include "radv_shader_args.h"
+#include "radv_debug.h"
 #include "nir/nir.h"
 
-#include 
-#include 
-#include 
-#include 
-
 #include "sid.h"
 #include "ac_binary.h"
 #include "ac_llvm_util.h"
@@ -47,38 +44,22 @@ struct radv_shader_context {
 	struct ac_llvm_context ac;
-	const struct radv_nir_compiler_options *options;
-	struct radv_shader_info *shader_info;
 	const struct nir_shader *shader;
 	struct ac_shader_abi abi;
+	const struct radv_shader_args *args;
+
+	gl_shader_stage stage;
 
 	unsigned max_workgroup_size;
 	LLVMContextRef context;
 	LLVMValueRef main_function;
 
 	LLVMValueRef descriptor_sets[MAX_SETS];
+
 	LLVMValueRef ring_offsets;
 
-	LLVMValueRef vertex_buffers;
 	LLVMValueRef rel_auto_id;
-	LLVMValueRef vs_prim_id;
-	LLVMValueRef es2gs_offset;
-
-	LLVMValueRef oc_lds;
-	LLVMValueRef merged_wave_info;
-	LLVMValueRef tess_factor_offset;
-	LLVMValueRef tes_rel_patch_id;
-	LLVMValueRef tes_u;
-	LLVMValueRef tes_v;
-
-	/* HW GS */
-	/* On gfx10:
-	 *  - bits 0..10: ordered_wave_id
-	 *  - bits 12..20: number of vertices in group
-	 *  - bits 22..30: number of primitives in group
-	 */
-	LLVMValueRef gs_tg_info;
-	LLVMValueRef gs2vs_offset;
+
 	LLVMValueRef gs_wave_id;
 
 	LLVMValueRef gs_vtx_offset[6];
@@ -87,19 +68,10 @@ struct radv_shader_context {
 	LLVMValueRef hs_ring_tess_offchip;
 	LLVMValueRef hs_ring_tess_factor;
 
-	/* Streamout */
-	LLVMValueRef streamout_buffers;
-	LLVMValueRef streamout_write_idx;
-	LLVMValueRef streamout_config;
-	LLVMValueRef streamout_offset[4];
-
-	gl_shader_stage stage;
-
 	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
 	uint64_t output_mask;
 
-	bool is_gs_copy_shader;
 	LLVMValueRef gs_next_vertex[4];
 	LLVMValueRef gs_curprim_verts[4];
 	LLVMValueRef gs_generated_prims[4];
@@ -119,14 +91,6 @@ struct radv_shader_output_values {
 	unsigned usage_mask;
 };
 
-enum radeon_llvm_calling_convention {
-	RADEON_LLVM_AMDGPU_VS = 87,
-	RADEON_LLVM_AMDGPU_GS = 88,
-	RADEON_LLVM_AMDGPU_PS = 89,
-	RADEON_LLVM_AMDGPU_CS = 90,
-	RADEON_LLVM_AMDGPU_HS = 93,
-};
-
 static inline struct radv_shader_context *
 radv_shader_context_from_abi(struct ac_shader_abi *abi)
 {
@@ -138,96 +102,17 @@ static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
 {
 	switch (ctx->stage) {
 	case MESA_SHADER_TESS_CTRL:
-		return ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
+		return ac_unpack_param(&ctx->ac,
+				       ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
+				       0, 8);
	case MESA_SHADER_TESS_EVAL:
-		return ctx->tes_rel_patch_id;
+		return ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id);
 		break;
 	default:
 		unreachable("Illegal stage");
 	}
 }
 
-static unsigned
-get_tcs_num_patches(struct radv_shader_context *ctx)
-{
-	unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
-	unsigned num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out;
-	uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
-	uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size;
-	uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - unsigned num_patches; - unsigned hardware_lds_size; - - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. - */ - num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4; - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - */ - hardware_lds_size = 32768; - - /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single - * threadgroup, even though there is more than 32 KiB LDS. - * - * Test: dEQP-VK.tessellation.shader_input_output.barrier - */ - if (ctx->options->chip_class >= GFX7 && ctx->options->family != CHIP_STONEY) - hardware_lds_size = 65536; - - num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); - /* Make sure the output data fits in the offchip buffer */ - num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size); - /* Not necessary for correctness, but improves performance. The - * specific value is taken from the proprietary driver. - */ - num_patches = MIN2(num_patches, 40); - - /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */ - if (ctx->options->chip_class == GFX6) { - unsigned one_wave = ctx->options->wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp); - num_patches = MIN2(num_patches, one_wave); - } - return num_patches; -} - -static unsigned -calculate_tess_lds_size(struct radv_shader_context *ctx) -{ - unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp; - unsigned num_tcs_outputs, num_tcs_patch_outputs; - unsigned input_vertex_size, output_vertex_size; - unsigned input_patch_size, output_patch_size; - unsigned pervertex_output_patch_size; - unsigned output_patch0_offset; - unsigned num_patches; - unsigned lds_size; - - num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; - num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); - num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); - - input_vertex_size = ctx->tcs_num_inputs * 16; - output_vertex_size = num_tcs_outputs * 16; - - input_patch_size = num_tcs_input_cp * input_vertex_size; - - pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - - num_patches = ctx->tcs_num_patches; - output_patch0_offset = input_patch_size * num_patches; - - lds_size = output_patch0_offset + output_patch_size * num_patches; - return lds_size; -} - /* Tessellation shaders pass outputs to the next shader using LDS. 
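 * For reference, the patch-count and LDS sizing math above works out as
 * follows for a hypothetical shader with 4 input and 4 output control
 * points, 8 per-vertex input/output slots and 2 per-patch slots:
 *
 *   input_vertex_size  = 8 * 16           = 128 bytes
 *   input_patch_size   = 4 * 128          = 512 bytes
 *   output_patch_size  = 4 * 128 + 2 * 16 = 544 bytes
 *   num_patches        = MIN3(64 / 4 * 4,            (one wave per SIMD)
 *                             65536 / (512 + 544),   (fits in 64 KiB LDS)
 *                             40)                  = 40
 *   lds_size           = (512 + 544) * 40 = 42240 bytes
 *
 * (This ignores the off-chip buffer clamp, which depends on
 * tess_offchip_block_dw_size.)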
* * LS outputs = TCS inputs @@ -251,9 +136,9 @@ calculate_tess_lds_size(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_in_patch_stride(struct radv_shader_context *ctx) { - assert (ctx->stage == MESA_SHADER_TESS_CTRL); + assert(ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; input_patch_size /= 4; return LLVMConstInt(ctx->ac.i32, input_patch_size, false); @@ -262,8 +147,8 @@ get_tcs_in_patch_stride(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_out_patch_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->tcs.patch_outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; @@ -274,7 +159,7 @@ get_tcs_out_patch_stride(struct radv_shader_context *ctx) static LLVMValueRef get_tcs_out_vertex_stride(struct radv_shader_context *ctx) { - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; output_vertex_size /= 4; return LLVMConstInt(ctx->ac.i32, output_vertex_size, false); @@ -285,7 +170,7 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx) { assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; unsigned num_patches = ctx->tcs_num_patches; @@ -299,10 +184,10 @@ get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) { assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned num_patches = ctx->tcs_num_patches; @@ -345,87 +230,16 @@ get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx) patch0_patch_data_offset); } -#define MAX_ARGS 64 -struct arg_info { - LLVMTypeRef types[MAX_ARGS]; - LLVMValueRef *assign[MAX_ARGS]; - uint8_t count; - uint8_t sgpr_count; - uint8_t num_sgprs_used; - uint8_t num_vgprs_used; -}; - -enum ac_arg_regfile { - ARG_SGPR, - ARG_VGPR, -}; - -static void -add_arg(struct arg_info *info, enum ac_arg_regfile regfile, 
LLVMTypeRef type, - LLVMValueRef *param_ptr) -{ - assert(info->count < MAX_ARGS); - - info->assign[info->count] = param_ptr; - info->types[info->count] = type; - info->count++; - - if (regfile == ARG_SGPR) { - info->num_sgprs_used += ac_get_type_size(type) / 4; - info->sgpr_count++; - } else { - assert(regfile == ARG_VGPR); - info->num_vgprs_used += ac_get_type_size(type) / 4; - } -} - -static void assign_arguments(LLVMValueRef main_function, - struct arg_info *info) -{ - unsigned i; - for (i = 0; i < info->count; i++) { - if (info->assign[i]) - *info->assign[i] = LLVMGetParam(main_function, i); - } -} - static LLVMValueRef -create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, - LLVMBuilderRef builder, LLVMTypeRef *return_types, - unsigned num_return_elems, - struct arg_info *args, +create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, + LLVMBuilderRef builder, + const struct ac_shader_args *args, + enum ac_llvm_calling_convention convention, unsigned max_workgroup_size, const struct radv_nir_compiler_options *options) { - LLVMTypeRef main_function_type, ret_type; - LLVMBasicBlockRef main_function_body; - - if (num_return_elems) - ret_type = LLVMStructTypeInContext(ctx, return_types, - num_return_elems, true); - else - ret_type = LLVMVoidTypeInContext(ctx); - - /* Setup the function */ - main_function_type = - LLVMFunctionType(ret_type, args->types, args->count, 0); LLVMValueRef main_function = - LLVMAddFunction(module, "main", main_function_type); - main_function_body = - LLVMAppendBasicBlockInContext(ctx, main_function, "main_body"); - LLVMPositionBuilderAtEnd(builder, main_function_body); - - LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS); - for (unsigned i = 0; i < args->sgpr_count; ++i) { - LLVMValueRef P = LLVMGetParam(main_function, i); - - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG); - - if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { - ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); - ac_add_attr_dereferenceable(P, UINT64_MAX); - } - } + ac_build_main(args, ctx, convention, "main", ctx->voidt, module); if (options->address32_hi) { ac_llvm_add_target_dep_function_attr(main_function, @@ -435,469 +249,57 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, ac_llvm_set_workgroup_size(main_function, max_workgroup_size); - if (options->unsafe_math) { - /* These were copied from some LLVM test. 
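		 * They relax IEEE float semantics for the whole function:
		 * fmad contraction becomes legal and NaNs, infinities and
		 * signed zeros are assumed not to occur, which is what the
		 * unsafe_math option promises.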
*/ - LLVMAddTargetDependentFunctionAttr(main_function, - "less-precise-fpmad", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "no-infs-fp-math", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "no-nans-fp-math", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "unsafe-fp-math", - "true"); - LLVMAddTargetDependentFunctionAttr(main_function, - "no-signed-zeros-fp-math", - "true"); - } return main_function; } - -static void -set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - ud_info->sgpr_idx = *sgpr_idx; - ud_info->num_sgprs = num_sgprs; - *sgpr_idx += num_sgprs; -} - -static void -set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx, - uint8_t num_sgprs) -{ - struct radv_userdata_info *ud_info = - &ctx->shader_info->user_sgprs_locs.shader_data[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, num_sgprs); -} - -static void -set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) -{ - bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; - - set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); -} - -static void -set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) -{ - struct radv_userdata_locations *locs = - &ctx->shader_info->user_sgprs_locs; - struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; - assert(ud_info); - - set_loc(ud_info, sgpr_idx, 1); - - locs->descriptor_sets_enabled |= 1 << idx; -} - -struct user_sgpr_info { - bool need_ring_offsets; - bool indirect_all_descriptor_sets; - uint8_t remaining_sgprs; -}; - -static bool needs_view_index_sgpr(struct radv_shader_context *ctx, - gl_shader_stage stage) -{ - switch (stage) { - case MESA_SHADER_VERTEX: - if (ctx->shader_info->needs_multiview_view_index || - (!ctx->options->key.vs_common_out.as_es && !ctx->options->key.vs_common_out.as_ls && ctx->options->key.has_multiview_view_index)) - return true; - break; - case MESA_SHADER_TESS_EVAL: - if (ctx->shader_info->needs_multiview_view_index || (!ctx->options->key.vs_common_out.as_es && ctx->options->key.has_multiview_view_index)) - return true; - break; - case MESA_SHADER_GEOMETRY: - case MESA_SHADER_TESS_CTRL: - if (ctx->shader_info->needs_multiview_view_index) - return true; - break; - default: - break; - } - return false; -} - -static uint8_t -count_vs_user_sgprs(struct radv_shader_context *ctx) -{ - uint8_t count = 0; - - if (ctx->shader_info->vs.has_vertex_buffers) - count++; - count += ctx->shader_info->vs.needs_draw_id ? 3 : 2; - - return count; -} - -static void allocate_inline_push_consts(struct radv_shader_context *ctx, - struct user_sgpr_info *user_sgpr_info) -{ - uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; - - /* Only supported if shaders use push constants. */ - if (ctx->shader_info->min_push_constant_used == UINT8_MAX) - return; - - /* Only supported if shaders don't have indirect push constants. */ - if (ctx->shader_info->has_indirect_push_constants) - return; - - /* Only supported for 32-bit push constants. */ - if (!ctx->shader_info->has_only_32bit_push_constants) - return; - - uint8_t num_push_consts = - (ctx->shader_info->max_push_constant_used - - ctx->shader_info->min_push_constant_used) / 4; - - /* Check if the number of user SGPRs is large enough. 
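	 * E.g. with hypothetical bounds min_push_constant_used = 16 and
	 * max_push_constant_used = 32, (32 - 16) / 4 = 4 dwords are
	 * inlined and loads are rebased to base_inline_push_consts =
	 * 16 / 4 = 4.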
*/ - if (num_push_consts < remaining_sgprs) { - ctx->shader_info->num_inline_push_consts = num_push_consts; - } else { - ctx->shader_info->num_inline_push_consts = remaining_sgprs; - } - - /* Clamp to the maximum number of allowed inlined push constants. */ - if (ctx->shader_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) - ctx->shader_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; - - if (ctx->shader_info->num_inline_push_consts == num_push_consts && - !ctx->shader_info->loads_dynamic_offsets) { - /* Disable the default push constants path if all constants are - * inlined and if shaders don't use dynamic descriptors. - */ - ctx->shader_info->loads_push_constants = false; - } - - ctx->shader_info->base_inline_push_consts = - ctx->shader_info->min_push_constant_used / 4; -} - -static void allocate_user_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, - bool needs_view_index, - struct user_sgpr_info *user_sgpr_info) -{ - uint8_t user_sgpr_count = 0; - - memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info)); - - /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */ - if (stage == MESA_SHADER_GEOMETRY || - stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_CTRL || - stage == MESA_SHADER_TESS_EVAL || - ctx->is_gs_copy_shader) - user_sgpr_info->need_ring_offsets = true; - - if (stage == MESA_SHADER_FRAGMENT && - ctx->shader_info->ps.needs_sample_positions) - user_sgpr_info->need_ring_offsets = true; - - /* 2 user sgprs will nearly always be allocated for scratch/rings */ - if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) { - user_sgpr_count += 2; - } - - switch (stage) { - case MESA_SHADER_COMPUTE: - if (ctx->shader_info->cs.uses_grid_size) - user_sgpr_count += 3; - break; - case MESA_SHADER_FRAGMENT: - user_sgpr_count += ctx->shader_info->ps.needs_sample_positions; - break; - case MESA_SHADER_VERTEX: - if (!ctx->is_gs_copy_shader) - user_sgpr_count += count_vs_user_sgprs(ctx); - break; - case MESA_SHADER_TESS_CTRL: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - user_sgpr_count += count_vs_user_sgprs(ctx); - } - break; - case MESA_SHADER_TESS_EVAL: - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) { - user_sgpr_count += count_vs_user_sgprs(ctx); - } - } - break; - default: - break; - } - - if (needs_view_index) - user_sgpr_count++; - - if (ctx->shader_info->loads_push_constants) - user_sgpr_count++; - - if (ctx->streamout_buffers) - user_sgpr_count++; - - uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 
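		/* graphics stages on GFX9+ expose 32 user SGPRs,
		   compute and pre-GFX9 only 16 */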
32 : 16; - uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; - uint32_t num_desc_set = - util_bitcount(ctx->shader_info->desc_set_used_mask); - - if (remaining_sgprs < num_desc_set) { - user_sgpr_info->indirect_all_descriptor_sets = true; - user_sgpr_info->remaining_sgprs = remaining_sgprs - 1; - } else { - user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; - } - - allocate_inline_push_consts(ctx, user_sgpr_info); -} - static void -declare_global_input_sgprs(struct radv_shader_context *ctx, - const struct user_sgpr_info *user_sgpr_info, - struct arg_info *args, - LLVMValueRef *desc_sets) +load_descriptor_sets(struct radv_shader_context *ctx) { - LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8); - - /* 1 for each descriptor set */ - if (!user_sgpr_info->indirect_all_descriptor_sets) { - uint32_t mask = ctx->shader_info->desc_set_used_mask; - + uint32_t mask = ctx->args->shader_info->desc_set_used_mask; + if (ctx->args->shader_info->need_indirect_descriptor_sets) { + LLVMValueRef desc_sets = + ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]); while (mask) { int i = u_bit_scan(&mask); - add_arg(args, ARG_SGPR, type, &ctx->descriptor_sets[i]); - } - } else { - add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(type), - desc_sets); - } - - if (ctx->shader_info->loads_push_constants) { - /* 1 for push constants and dynamic descriptors */ - add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants); - } - - for (unsigned i = 0; i < ctx->shader_info->num_inline_push_consts; i++) { - add_arg(args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.inline_push_consts[i]); - } - ctx->abi.num_inline_push_consts = ctx->shader_info->num_inline_push_consts; - ctx->abi.base_inline_push_consts = ctx->shader_info->base_inline_push_consts; - - if (ctx->shader_info->so.num_outputs) { - add_arg(args, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->ac.v4i32), - &ctx->streamout_buffers); - } -} - -static void -declare_vs_specific_input_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, - struct arg_info *args) -{ - if (!ctx->is_gs_copy_shader && - (stage == MESA_SHADER_VERTEX || - (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->vs.has_vertex_buffers) { - add_arg(args, ARG_SGPR, - ac_array_in_const32_addr_space(ctx->ac.v4i32), - &ctx->vertex_buffers); - } - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex); - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance); - if (ctx->shader_info->vs.needs_draw_id) { - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id); - } - } -} - -static void -declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) -{ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id); - if (!ctx->is_gs_copy_shader) { - if (ctx->options->key.vs_common_out.as_ls) { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id); - if (ctx->ac.chip_class >= GFX10) { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - } - } else { - if (ctx->ac.chip_class >= GFX10) { - if (ctx->options->key.vs_common_out.as_ngg) { - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* user vgpr */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } else { - 
add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - } - } else { - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */ - } - } - } -} - -static void -declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, - struct arg_info *args) -{ - int i; - - /* Streamout SGPRs. */ - if (ctx->shader_info->so.num_outputs) { - assert(stage == MESA_SHADER_VERTEX || - stage == MESA_SHADER_TESS_EVAL); - - if (stage != MESA_SHADER_TESS_EVAL) { - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_config); - } else { - args->assign[args->count - 1] = &ctx->streamout_config; - args->types[args->count - 1] = ctx->ac.i32; - } - - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_write_idx); - } - - /* A streamout buffer offset is loaded if the stride is non-zero. */ - for (i = 0; i < 4; i++) { - if (!ctx->shader_info->so.strides[i]) - continue; - - add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]); - } -} - -static void -declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) -{ - add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u); - add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id); - add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id); -} - -static void -set_global_input_locs(struct radv_shader_context *ctx, - const struct user_sgpr_info *user_sgpr_info, - LLVMValueRef desc_sets, uint8_t *user_sgpr_idx) -{ - uint32_t mask = ctx->shader_info->desc_set_used_mask; - - if (!user_sgpr_info->indirect_all_descriptor_sets) { - while (mask) { - int i = u_bit_scan(&mask); + ctx->descriptor_sets[i] = + ac_build_load_to_sgpr(&ctx->ac, desc_sets, + LLVMConstInt(ctx->ac.i32, i, false)); - set_loc_desc(ctx, i, user_sgpr_idx); } } else { - set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, - user_sgpr_idx); - while (mask) { int i = u_bit_scan(&mask); ctx->descriptor_sets[i] = - ac_build_load_to_sgpr(&ctx->ac, desc_sets, - LLVMConstInt(ctx->ac.i32, i, false)); - - } - - ctx->shader_info->need_indirect_descriptor_sets = true; - } - - if (ctx->shader_info->loads_push_constants) { - set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); - } - - if (ctx->shader_info->num_inline_push_consts) { - set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, - ctx->shader_info->num_inline_push_consts); - } - - if (ctx->streamout_buffers) { - set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, - user_sgpr_idx); - } -} - -static void -set_vs_specific_input_locs(struct radv_shader_context *ctx, - gl_shader_stage stage, bool has_previous_stage, - gl_shader_stage previous_stage, - uint8_t *user_sgpr_idx) -{ - if (!ctx->is_gs_copy_shader && - (stage == MESA_SHADER_VERTEX || - (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (ctx->shader_info->vs.has_vertex_buffers) { - set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS, - user_sgpr_idx); + ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[i]); } - - unsigned vs_num = 2; - if (ctx->shader_info->vs.needs_draw_id) - vs_num++; - - set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, - user_sgpr_idx, vs_num); } } -static void set_llvm_calling_convention(LLVMValueRef func, - gl_shader_stage stage) +static enum ac_llvm_calling_convention +get_llvm_calling_convention(LLVMValueRef 
func, gl_shader_stage stage) { - enum radeon_llvm_calling_convention calling_conv; - switch (stage) { case MESA_SHADER_VERTEX: case MESA_SHADER_TESS_EVAL: - calling_conv = RADEON_LLVM_AMDGPU_VS; + return AC_LLVM_AMDGPU_VS; break; case MESA_SHADER_GEOMETRY: - calling_conv = RADEON_LLVM_AMDGPU_GS; + return AC_LLVM_AMDGPU_GS; break; case MESA_SHADER_TESS_CTRL: - calling_conv = RADEON_LLVM_AMDGPU_HS; + return AC_LLVM_AMDGPU_HS; break; case MESA_SHADER_FRAGMENT: - calling_conv = RADEON_LLVM_AMDGPU_PS; + return AC_LLVM_AMDGPU_PS; break; case MESA_SHADER_COMPUTE: - calling_conv = RADEON_LLVM_AMDGPU_CS; + return AC_LLVM_AMDGPU_CS; break; default: unreachable("Unhandle shader type"); } - - LLVMSetFunctionCallConv(func, calling_conv); } /* Returns whether the stage is a stage that can be directly before the GS */ @@ -908,331 +310,37 @@ static bool is_pre_gs_stage(gl_shader_stage stage) static void create_function(struct radv_shader_context *ctx, gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage) + bool has_previous_stage) { - uint8_t user_sgpr_idx; - struct user_sgpr_info user_sgpr_info; - struct arg_info args = {}; - LLVMValueRef desc_sets; - bool needs_view_index = needs_view_index_sgpr(ctx, stage); - if (ctx->ac.chip_class >= GFX10) { - if (is_pre_gs_stage(stage) && ctx->options->key.vs_common_out.as_ngg) { + if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) { /* On GFX10, VS is merged into GS for NGG. */ - previous_stage = stage; stage = MESA_SHADER_GEOMETRY; has_previous_stage = true; } } - allocate_user_sgprs(ctx, stage, has_previous_stage, - previous_stage, needs_view_index, &user_sgpr_info); + ctx->main_function = create_llvm_function( + &ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac, + get_llvm_calling_convention(ctx->main_function, stage), + ctx->max_workgroup_size, + ctx->args->options); - if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) { - add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32), - &ctx->ring_offsets); - } + ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", + LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), + NULL, 0, AC_FUNC_ATTR_READNONE); + ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, + ac_array_in_const_addr_space(ctx->ac.v4i32), ""); - switch (stage) { - case MESA_SHADER_COMPUTE: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); + load_descriptor_sets(ctx); - if (ctx->shader_info->cs.uses_grid_size) { - add_arg(&args, ARG_SGPR, ctx->ac.v3i32, - &ctx->abi.num_work_groups); - } + if (stage == MESA_SHADER_TESS_CTRL || + (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) || + /* GFX9 has the ESGS ring buffer in LDS. 
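	 * The first two cases cover the LS -> HS path: TCS inputs and
	 * outputs are passed through LDS (see the layout comment above).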
*/ + (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { + ac_declare_lds_as_pointer(&ctx->ac); + } - for (int i = 0; i < 3; i++) { - ctx->abi.workgroup_ids[i] = NULL; - if (ctx->shader_info->cs.uses_block_id[i]) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.workgroup_ids[i]); - } - } - - if (ctx->shader_info->cs.uses_local_invocation_idx) - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.tg_size); - add_arg(&args, ARG_VGPR, ctx->ac.v3i32, - &ctx->abi.local_invocation_ids); - break; - case MESA_SHADER_VERTEX: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &args); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - if (ctx->options->key.vs_common_out.as_es) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->es2gs_offset); - } else if (ctx->options->key.vs_common_out.as_ls) { - /* no extra parameters */ - } else { - declare_streamout_sgprs(ctx, stage, &args); - } - - declare_vs_input_vgprs(ctx, &args); - break; - case MESA_SHADER_TESS_CTRL: - if (has_previous_stage) { - // First 6 system regs - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->merged_wave_info); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->tess_factor_offset); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - declare_vs_specific_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, &args); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_patch_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_rel_ids); - - declare_vs_input_vgprs(ctx, &args); - } else { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->tess_factor_offset); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_patch_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.tcs_rel_ids); - } - break; - case MESA_SHADER_TESS_EVAL: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - if (ctx->options->key.vs_common_out.as_es) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->es2gs_offset); - } else { - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); - declare_streamout_sgprs(ctx, stage, &args); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - } - declare_tes_input_vgprs(ctx, &args); - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - // First 6 system regs - if (ctx->options->key.vs_common_out.as_ngg) { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs_tg_info); - } else { - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->gs2vs_offset); - } - - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->merged_wave_info); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); 
// unknown - add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (previous_stage != MESA_SHADER_TESS_EVAL) { - declare_vs_specific_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, - &args); - } - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[0]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[2]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_prim_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_invocation_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[4]); - - if (previous_stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(ctx, &args); - } else { - declare_tes_input_vgprs(ctx, &args); - } - } else { - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - if (needs_view_index) - add_arg(&args, ARG_SGPR, ctx->ac.i32, - &ctx->abi.view_index); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset); - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[0]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[1]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_prim_id); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[2]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[3]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[4]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->gs_vtx_offset[5]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, - &ctx->abi.gs_invocation_id); - } - break; - case MESA_SHADER_FRAGMENT: - declare_global_input_sgprs(ctx, &user_sgpr_info, &args, - &desc_sets); - - add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.persp_centroid); - add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */ - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_sample); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_center); - add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->abi.linear_centroid); - add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL); /* line stipple tex */ - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]); - add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary); - add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage); - add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL); /* fixed pt */ - break; - default: - unreachable("Shader stage not implemented"); - } - - ctx->main_function = create_llvm_function( - ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args, - ctx->max_workgroup_size, ctx->options); - set_llvm_calling_convention(ctx->main_function, stage); - - - ctx->shader_info->num_input_vgprs = 0; - ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 
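		/* the scratch/ring-offsets descriptor pointer is a full
		   64-bit address and thus occupies two SGPRs */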
2 : 0; - - ctx->shader_info->num_input_sgprs += args.num_sgprs_used; - - if (ctx->stage != MESA_SHADER_FRAGMENT) - ctx->shader_info->num_input_vgprs = args.num_vgprs_used; - - assign_arguments(ctx->main_function, &args); - - user_sgpr_idx = 0; - - if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) { - set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, - &user_sgpr_idx); - if (ctx->options->supports_spill) { - ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", - LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), - NULL, 0, AC_FUNC_ATTR_READNONE); - ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, - ac_array_in_const_addr_space(ctx->ac.v4i32), ""); - } - } - - /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including - * the rw_buffers at s0/s1. With user SGPR0 = s8, lets restart the count from 0 */ - if (has_previous_stage) - user_sgpr_idx = 0; - - set_global_input_locs(ctx, &user_sgpr_info, desc_sets, &user_sgpr_idx); - - switch (stage) { - case MESA_SHADER_COMPUTE: - if (ctx->shader_info->cs.uses_grid_size) { - set_loc_shader(ctx, AC_UD_CS_GRID_SIZE, - &user_sgpr_idx, 3); - } - break; - case MESA_SHADER_VERTEX: - set_vs_specific_input_locs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_idx); - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_TESS_CTRL: - set_vs_specific_input_locs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_idx); - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_TESS_EVAL: - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - set_vs_specific_input_locs(ctx, stage, - has_previous_stage, - previous_stage, - &user_sgpr_idx); - } - if (ctx->abi.view_index) - set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - break; - case MESA_SHADER_FRAGMENT: - break; - default: - unreachable("Shader stage not implemented"); - } - - if (stage == MESA_SHADER_TESS_CTRL || - (stage == MESA_SHADER_VERTEX && ctx->options->key.vs_common_out.as_ls) || - /* GFX9 has the ESGS ring buffer in LDS. 
*/ - (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { - ac_declare_lds_as_pointer(&ctx->ac); - } - - ctx->shader_info->num_user_sgprs = user_sgpr_idx; } @@ -1242,7 +350,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; - struct radv_pipeline_layout *pipeline_layout = ctx->options->layout; + struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; unsigned base_offset = layout->binding[binding].offset; LLVMValueRef offset, stride; @@ -1251,7 +359,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; - desc_ptr = ctx->abi.push_constants; + desc_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.push_constants); base_offset = pipeline_layout->push_constant_size + 16 * idx; stride = LLVMConstInt(ctx->ac.i32, 16, false); } else @@ -1275,7 +383,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, if (ctx->ac.chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -1284,7 +392,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, LLVMValueRef desc_components[4] = { LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""), - LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), false), /* High limit to support variable sizes. 
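	 * The four dwords below form a raw buffer descriptor (V#):
	 * dword0 = base address[31:0], dword1 = base address[47:32] with
	 * stride 0, dword2 = num_records and dword3 = the type bits
	 * assembled into desc_type above.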
*/ LLVMConstInt(ctx->ac.i32, 0xffffffff, false), LLVMConstInt(ctx->ac.i32, desc_type, false), @@ -1320,9 +428,9 @@ static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context *ctx) uint32_t num_patches = ctx->tcs_num_patches; uint32_t num_tcs_outputs; if (ctx->stage == MESA_SHADER_TESS_CTRL) - num_tcs_outputs = util_last_bit64(ctx->shader_info->tcs.outputs_written); + num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); else - num_tcs_outputs = ctx->options->key.tes.tcs_num_outputs; + num_tcs_outputs = ctx->args->options->key.tes.tcs_num_outputs; uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; @@ -1491,6 +599,7 @@ store_tcs_output(struct ac_shader_abi *abi, LLVMValueRef dw_addr; LLVMValueRef stride = NULL; LLVMValueRef buf_addr = NULL; + LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds); unsigned param; bool store_lds = true; @@ -1547,14 +656,14 @@ store_tcs_output(struct ac_shader_abi *abi, if (!is_tess_factor && writemask != 0xF) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1, - buf_addr, ctx->oc_lds, - 4 * (base + chan), ac_glc, false); + buf_addr, oc_lds, + 4 * (base + chan), ac_glc); } if (writemask == 0xF) { ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4, - buf_addr, ctx->oc_lds, - (base * 4), ac_glc, false); + buf_addr, oc_lds, + (base * 4), ac_glc); } } @@ -1575,6 +684,7 @@ load_tes_input(struct ac_shader_abi *abi, struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef buf_addr; LLVMValueRef result; + LLVMValueRef oc_lds = ac_get_arg(&ctx->ac, ctx->args->oc_lds); unsigned param = shader_io_get_unique_index(location); if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { @@ -1593,11 +703,23 @@ load_tes_input(struct ac_shader_abi *abi, buf_addr = LLVMBuildAdd(ctx->ac.builder, buf_addr, comp_offset, ""); result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL, - buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, ac_glc, true, false); + buf_addr, oc_lds, is_compact ? 
(4 * const_index) : 0, ac_glc, true, false); result = ac_trim_vector(&ctx->ac, result, num_components); return result; } +static LLVMValueRef +radv_emit_fetch_64bit(struct radv_shader_context *ctx, + LLVMTypeRef type, LLVMValueRef a, LLVMValueRef b) +{ + LLVMValueRef values[2] = { + ac_to_integer(&ctx->ac, a), + ac_to_integer(&ctx->ac, b), + }; + LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); + return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); +} + static LLVMValueRef load_gs_input(struct ac_shader_abi *abi, unsigned location, @@ -1626,6 +748,14 @@ load_gs_input(struct ac_shader_abi *abi, dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), ""); value[i] = ac_lds_load(&ctx->ac, dw_addr); + + if (ac_get_type_size(type) == 8) { + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, + LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index + 1, 0), ""); + LLVMValueRef tmp = ac_lds_load(&ctx->ac, dw_addr); + + value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp); + } } else { LLVMValueRef soffset = LLVMConstInt(ctx->ac.i32, @@ -1637,6 +767,21 @@ load_gs_input(struct ac_shader_abi *abi, ctx->ac.i32_0, vtx_offset, soffset, 0, ac_glc, true, false); + + if (ac_get_type_size(type) == 8) { + soffset = LLVMConstInt(ctx->ac.i32, + (param * 4 + i + const_index + 1) * 256, + false); + + LLVMValueRef tmp = + ac_build_buffer_load(&ctx->ac, + ctx->esgs_ring, 1, + ctx->ac.i32_0, + vtx_offset, soffset, + 0, ac_glc, true, false); + + value[i] = radv_emit_fetch_64bit(ctx, type, value[i], tmp); + } } if (ac_get_type_size(type) == 2) { @@ -1650,13 +795,6 @@ load_gs_input(struct ac_shader_abi *abi, return result; } - -static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - ac_build_kill_if_false(&ctx->ac, visible); -} - static uint32_t radv_get_sample_pos_offset(uint32_t num_samples) { @@ -1691,7 +829,7 @@ static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, ac_array_in_const_addr_space(ctx->ac.v2f32), ""); uint32_t sample_pos_offset = - radv_get_sample_pos_offset(ctx->options->key.fs.num_samples); + radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples); sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id, @@ -1707,11 +845,11 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); uint8_t log2_ps_iter_samples; - if (ctx->shader_info->ps.force_persample) { + if (ctx->args->shader_info->ps.force_persample) { log2_ps_iter_samples = - util_logbase2(ctx->options->key.fs.num_samples); + util_logbase2(ctx->args->options->key.fs.num_samples); } else { - log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; + log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples; } /* The bit pattern matches that used by fixed function fragment @@ -1728,49 +866,36 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples]; LLVMValueRef result, sample_id; - sample_id = ac_unpack_param(&ctx->ac, abi->ancillary, 8, 4); + sample_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.ancillary), 8, 4); sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, ""); - result = LLVMBuildAnd(ctx->ac.builder, sample_id, abi->sample_coverage, ""); + result = LLVMBuildAnd(ctx->ac.builder, sample_id, + 
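			       /* e.g. at 4x shading (log2 = 2) the mask is
				* 0x11111111; sample_id = 1 shifts it to
				* 0x22222222, so the AND below keeps only the
				* coverage bits belonging to sample 1 */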
ac_get_arg(&ctx->ac, ctx->args->ac.sample_coverage), ""); return result; } static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs); static void -visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs) +visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs) { - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->options->key.vs_common_out.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + if (ctx->args->options->key.vs_common_out.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs); return; } - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, kill it: excessive vertex emissions are not supposed to - * have any effect, and GS threads have no externally observable - * effects other than emitting vertices. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - ac_build_kill_if_false(&ctx->ac, can_emit); - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -1790,7 +915,7 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr offset++; - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, ""); voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), ""); out_val = ac_to_integer(&ctx->ac, out_val); @@ -1799,15 +924,13 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, - voffset, ctx->gs2vs_offset, 0, - ac_glc | ac_slc, true); + voffset, + ac_get_arg(&ctx->ac, + ctx->args->gs2vs_offset), + 0, ac_glc | ac_slc | ac_swizzled); } } - gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, - ctx->ac.i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); @@ -1818,7 +941,7 @@ visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); return; } @@ -1832,8 +955,8 @@ load_tess_coord(struct ac_shader_abi *abi) struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef coord[4] = { - ctx->tes_u, - ctx->tes_v, + ac_get_arg(&ctx->ac, ctx->args->tes_u), + ac_get_arg(&ctx->ac, ctx->args->tes_v), ctx->ac.f32_0, ctx->ac.f32_0, }; @@ -1849,13 +972,14 @@ static LLVMValueRef load_patch_vertices_in(struct ac_shader_abi *abi) { struct radv_shader_context *ctx = 
radv_shader_context_from_abi(abi); - return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false); + return LLVMConstInt(ctx->ac.i32, ctx->args->options->key.tcs.input_vertices, false); } static LLVMValueRef radv_load_base_vertex(struct ac_shader_abi *abi) { - return abi->base_vertex; + struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + return ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex); } static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi, @@ -1901,7 +1025,7 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef list = ctx->descriptor_sets[descriptor_set]; - struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout; + struct radv_descriptor_set_layout *layout = ctx->args->options->layout->set[descriptor_set].layout; struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; unsigned offset = binding->offset; unsigned stride = binding->size; @@ -2039,35 +1163,6 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); } -static unsigned -get_num_channels_from_data_format(unsigned data_format) -{ - switch (data_format) { - case V_008F0C_BUF_DATA_FORMAT_8: - case V_008F0C_BUF_DATA_FORMAT_16: - case V_008F0C_BUF_DATA_FORMAT_32: - return 1; - case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32: - return 2; - case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - case V_008F0C_BUF_DATA_FORMAT_32_32_32: - return 3; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - return 4; - default: - break; - } - - return 4; -} - static LLVMValueRef radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value, @@ -2089,10 +1184,8 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, for (unsigned i = 0; i < num_channels; i++) chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i); } else { - if (num_channels) { - assert(num_channels == 1); - chan[0] = value; - } + assert(num_channels == 1); + chan[0] = value; } for (unsigned i = num_channels; i < 4; i++) { @@ -2107,14 +1200,14 @@ static void handle_vs_input_decl(struct radv_shader_context *ctx, struct nir_variable *variable) { - LLVMValueRef t_list_ptr = ctx->vertex_buffers; + LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->vertex_buffers); LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef input; LLVMValueRef buffer_index; unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); uint8_t input_usage_mask = - ctx->shader_info->vs.input_usage_mask[variable->data.location]; + ctx->args->shader_info->vs.input_usage_mask[variable->data.location]; unsigned num_input_channels = util_last_bit(input_usage_mask); variable->data.driver_location = variable->data.location * 4; @@ -2123,14 +1216,14 @@ handle_vs_input_decl(struct radv_shader_context *ctx, for (unsigned i = 0; i < attrib_count; ++i) { LLVMValueRef output[4]; unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0; - unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index]; + unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index]; 
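		/* the pipeline key packs each attribute format as
		 * (num_format << 4) | data_format; unpack it below */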
unsigned data_format = attrib_format & 0x0f; unsigned num_format = (attrib_format >> 4) & 0x07; bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT; - if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { - uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index]; + if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { + uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index]; if (divisor) { buffer_index = ctx->abi.instance_id; @@ -2143,50 +1236,100 @@ handle_vs_input_decl(struct radv_shader_context *ctx, buffer_index = ctx->ac.i32_0; } - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, ""); - } else - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, - ctx->abi.base_vertex, ""); + buffer_index = LLVMBuildAdd(ctx->ac.builder, + ac_get_arg(&ctx->ac, + ctx->args->ac.start_instance),\ + buffer_index, ""); + } else { + buffer_index = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + ac_get_arg(&ctx->ac, + ctx->args->ac.base_vertex), ""); + } + + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); /* Adjust the number of channels to load based on the vertex * attribute format. */ - unsigned num_format_channels = get_num_channels_from_data_format(data_format); - unsigned num_channels = MIN2(num_input_channels, num_format_channels); - unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index]; - unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; - unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; + unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); + unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index]; + unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index]; + unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index]; - if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { /* Always load, at least, 3 channels for formats that * need to be shuffled because X<->Z. */ num_channels = MAX2(num_channels, 3); } - if (attrib_stride != 0 && attrib_offset > attrib_stride) { - LLVMValueRef buffer_offset = - LLVMConstInt(ctx->ac.i32, - attrib_offset / attrib_stride, false); + t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - buffer_index = LLVMBuildAdd(ctx->ac.builder, - buffer_index, - buffer_offset, ""); + /* Perform per-channel vertex fetch operations if unaligned + * access are detected. Only GFX6 and GFX10 are affected. 
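	 * E.g. a hypothetical 32_32_32 attribute (element_size = 12,
	 * chan_byte_size = 4) at attrib_offset = 4 trips the check
	 * (4 % 12 != 0) and is split into three single-dword fetches
	 * at offsets 4, 8 and 12, reassembled below with
	 * ac_build_gather_values().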
+ */ + bool unaligned_vertex_fetches = false; + if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class == GFX10) && + vtx_info->chan_format != data_format && + ((attrib_offset % vtx_info->element_size) || + (attrib_stride % vtx_info->element_size))) + unaligned_vertex_fetches = true; + + if (unaligned_vertex_fetches) { + unsigned chan_format = vtx_info->chan_format; + LLVMValueRef values[4]; - attrib_offset = attrib_offset % attrib_stride; - } + assert(ctx->ac.chip_class == GFX6 || + ctx->ac.chip_class == GFX10); - t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + for (unsigned chan = 0; chan < num_channels; chan++) { + unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; + LLVMValueRef chan_index = buffer_index; + + if (attrib_stride != 0 && chan_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + chan_offset / attrib_stride, false); - input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, - buffer_index, - LLVMConstInt(ctx->ac.i32, attrib_offset, false), - ctx->ac.i32_0, ctx->ac.i32_0, - num_channels, - data_format, num_format, 0, true); + chan_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); - if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + chan_offset = chan_offset % attrib_stride; + } + + values[chan] = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + chan_index, + LLVMConstInt(ctx->ac.i32, chan_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, 1, + chan_format, num_format, 0, true); + } + + input = ac_build_gather_values(&ctx->ac, values, num_channels); + } else { + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + attrib_offset / attrib_stride, false); + + buffer_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; + } + + input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + buffer_index, + LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, + num_channels, + data_format, num_format, 0, true); + } + + if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { LLVMValueRef c[4]; c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); @@ -2208,7 +1351,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx, } } - unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; + unsigned alpha_adjust = (ctx->args->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]); for (unsigned chan = 0; chan < 4; chan++) { @@ -2245,10 +1388,21 @@ prepare_interp_optimize(struct radv_shader_context *ctx, uses_center = true; } + ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.persp_centroid); + ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args->ac.linear_centroid); + if (uses_center && uses_centroid) { - LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, ""); - ctx->abi.persp_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->abi.persp_center, ctx->abi.persp_centroid, ""); - ctx->abi.linear_centroid = LLVMBuildSelect(ctx->ac.builder, sel, ctx->abi.linear_center, ctx->abi.linear_centroid, ""); + LLVMValueRef sel = LLVMBuildICmp(ctx->ac.builder, LLVMIntSLT, + ac_get_arg(&ctx->ac, ctx->args->ac.prim_mask), + ctx->ac.i32_0, ""); + 
ctx->abi.persp_centroid = + LLVMBuildSelect(ctx->ac.builder, sel, + ac_get_arg(&ctx->ac, ctx->args->ac.persp_center), + ctx->abi.persp_centroid, ""); + ctx->abi.linear_centroid = + LLVMBuildSelect(ctx->ac.builder, sel, + ac_get_arg(&ctx->ac, ctx->args->ac.linear_center), + ctx->abi.linear_centroid, ""); } } @@ -2312,9 +1466,9 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { unsigned index = target - V_008DFC_SQ_EXP_MRT; - unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; - bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; - bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1; + unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf; + bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1; + bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1; unsigned chan; LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL; @@ -2510,7 +1664,7 @@ radv_emit_stream_output(struct radv_shader_context *ctx, ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf], ctx->ac.i32_0, offset, - ac_glc | ac_slc, false); + ac_glc | ac_slc); } static void @@ -2519,9 +1673,10 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) int i; /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - assert(ctx->streamout_config); + assert(ctx->args->streamout_config.used); LLVMValueRef so_vtx_count = - ac_build_bfe(&ctx->ac, ctx->streamout_config, + ac_build_bfe(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->streamout_config), LLVMConstInt(ctx->ac.i32, 16, false), LLVMConstInt(ctx->ac.i32, 7, false), false); @@ -2542,7 +1697,8 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) * (streamout_write_index + thread_id)*stride[buffer_id] + * attrib_offset */ - LLVMValueRef so_write_index = ctx->streamout_write_idx; + LLVMValueRef so_write_index = + ac_get_arg(&ctx->ac, ctx->args->streamout_write_idx); /* Compute (streamout_write_index + thread_id). */ so_write_index = @@ -2553,10 +1709,10 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) */ LLVMValueRef so_write_offset[4] = {}; LLVMValueRef so_buffers[4] = {}; - LLVMValueRef buf_ptr = ctx->streamout_buffers; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); for (i = 0; i < 4; i++) { - uint16_t stride = ctx->shader_info->so.strides[i]; + uint16_t stride = ctx->args->shader_info->so.strides[i]; if (!stride) continue; @@ -2567,7 +1723,8 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - LLVMValueRef so_offset = ctx->streamout_offset[i]; + LLVMValueRef so_offset = + ac_get_arg(&ctx->ac, ctx->args->streamout_offset[i]); so_offset = LLVMBuildMul(ctx->ac.builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, false), ""); @@ -2580,10 +1737,10 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) } /* Write streamout data. 
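 		 * With NGG streamout the buffers are written from the NGG
 		 * epilogue instead (see build_streamout()), hence the
 		 * use_ngg_streamout check above.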
*/ - for (i = 0; i < ctx->shader_info->so.num_outputs; i++) { + for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) { struct radv_shader_output_values shader_out = {}; struct radv_stream_output *output = - &ctx->shader_info->so.outputs[i]; + &ctx->args->shader_info->so.outputs[i]; if (stream != output->stream) continue; @@ -2615,6 +1772,7 @@ radv_build_param_exports(struct radv_shader_context *ctx, if (slot_name != VARYING_SLOT_LAYER && slot_name != VARYING_SLOT_PRIMITIVE_ID && + slot_name != VARYING_SLOT_VIEWPORT && slot_name != VARYING_SLOT_CLIP_DIST0 && slot_name != VARYING_SLOT_CLIP_DIST1 && slot_name < VARYING_SLOT_VAR0) @@ -2708,7 +1866,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, if (outinfo->writes_layer == true) pos_args[1].out[2] = layer_value; if (outinfo->writes_viewport_index == true) { - if (ctx->options->chip_class >= GFX9) { + if (ctx->args->options->chip_class >= GFX9) { /* GFX9 has the layer in out.z[10:0] and the viewport * index in out.z[19:16]. */ @@ -2734,12 +1892,10 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, outinfo->pos_exports++; } - /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. * Setting valid_mask=1 prevents it and has no other effect. */ - if (ctx->ac.family == CHIP_NAVI10 || - ctx->ac.family == CHIP_NAVI12 || - ctx->ac.family == CHIP_NAVI14) + if (ctx->ac.chip_class == GFX10) pos_args[0].valid_mask = 1; pos_idx = 0; @@ -2770,7 +1926,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, struct radv_shader_output_values *outputs; unsigned noutput = 0; - if (ctx->options->key.has_multiview_view_index) { + if (ctx->args->options->key.has_multiview_view_index) { LLVMValueRef* tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; if(!*tmp_out) { for(unsigned i = 0; i < 4; ++i) @@ -2778,7 +1934,8 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); } - LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, ctx->abi.view_index), *tmp_out); + LLVMValueRef view_index = ac_get_arg(&ctx->ac, ctx->args->ac.view_index); + LLVMBuildStore(ctx->ac.builder, ac_to_float(&ctx->ac, view_index), *tmp_out); ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; } @@ -2786,8 +1943,9 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, sizeof(outinfo->vs_output_param_offset)); outinfo->pos_exports = 0; - if (ctx->shader_info->so.num_outputs && - !ctx->is_gs_copy_shader) { + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs && + !ctx->args->is_gs_copy_shader) { /* The GS copy shader emission already emits streamout. 
*/ radv_emit_streamout(ctx, 0); } @@ -2804,16 +1962,16 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; if (ctx->stage == MESA_SHADER_VERTEX && - !ctx->is_gs_copy_shader) { + !ctx->args->is_gs_copy_shader) { outputs[noutput].usage_mask = - ctx->shader_info->vs.output_usage_mask[i]; + ctx->args->shader_info->vs.output_usage_mask[i]; } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { outputs[noutput].usage_mask = - ctx->shader_info->tes.output_usage_mask[i]; + ctx->args->shader_info->tes.output_usage_mask[i]; } else { - assert(ctx->is_gs_copy_shader); + assert(ctx->args->is_gs_copy_shader); outputs[noutput].usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; } for (unsigned j = 0; j < 4; j++) { @@ -2829,7 +1987,8 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, outputs[noutput].slot_name = VARYING_SLOT_PRIMITIVE_ID; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; - outputs[noutput].values[0] = ctx->vs_prim_id; + outputs[noutput].values[0] = + ac_get_arg(&ctx->ac, ctx->args->vs_prim_id); for (unsigned j = 1; j < 4; j++) outputs[noutput].values[j] = ctx->ac.f32_0; noutput++; @@ -2850,7 +2009,9 @@ handle_es_outputs_post(struct radv_shader_context *ctx, if (ctx->ac.chip_class >= GFX9) { unsigned itemsize_dw = outinfo->esgs_itemsize / 4; LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); + LLVMValueRef wave_idx = + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, LLVMBuildMul(ctx->ac.builder, wave_idx, LLVMConstInt(ctx->ac.i32, @@ -2870,11 +2031,11 @@ handle_es_outputs_post(struct radv_shader_context *ctx, if (ctx->stage == MESA_SHADER_VERTEX) { output_usage_mask = - ctx->shader_info->vs.output_usage_mask[i]; + ctx->args->shader_info->vs.output_usage_mask[i]; } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); output_usage_mask = - ctx->shader_info->tes.output_usage_mask[i]; + ctx->args->shader_info->tes.output_usage_mask[i]; } param_index = shader_io_get_unique_index(i); @@ -2904,248 +2065,728 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, - NULL, ctx->es2gs_offset, + NULL, + ac_get_arg(&ctx->ac, ctx->args->es2gs_offset), (4 * param_index + j) * 4, - ac_glc | ac_slc, true); + ac_glc | ac_slc | ac_swizzled); + } + } + } +} + +static void +handle_ls_outputs_post(struct radv_shader_context *ctx) +{ + LLVMValueRef vertex_id = ctx->rel_auto_id; + uint32_t num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); + LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false); + LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, + vertex_dw_stride, ""); + + for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { + LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; + + if (!(ctx->output_mask & (1ull << i))) + continue; + + int param = shader_io_get_unique_index(i); + LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, + LLVMConstInt(ctx->ac.i32, param * 4, false), + ""); + for (unsigned j = 0; j < 4; j++) { + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); + value = ac_to_integer(&ctx->ac, value); + value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); + ac_lds_store(&ctx->ac, 
dw_addr, value); + dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, ""); + } + } +} + +static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx) +{ + return ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); +} + +static LLVMValueRef get_tgsize(struct radv_shader_context *ctx) +{ + return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 28, 4); +} + +static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) +{ + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef tmp; + tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); + return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); +} + +static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), + LLVMConstInt(ctx->ac.i32, 12, false), + LLVMConstInt(ctx->ac.i32, 9, false), + false); +} + +static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), + LLVMConstInt(ctx->ac.i32, 22, false), + LLVMConstInt(ctx->ac.i32, 9, false), + false); +} + +static LLVMValueRef ngg_get_ordered_id(struct radv_shader_context *ctx) +{ + return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), + ctx->ac.i32_0, + LLVMConstInt(ctx->ac.i32, 12, false), + false); +} + +static LLVMValueRef +ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) +{ + unsigned num_outputs = util_bitcount64(ctx->output_mask); + + if (ctx->args->options->key.has_multiview_view_index) + num_outputs++; + + LLVMTypeRef elements[2] = { + LLVMArrayType(ctx->ac.i32, 4 * num_outputs), + LLVMArrayType(ctx->ac.i8, 4), + }; + LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); + type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); + return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); +} + +/** + * Return a pointer to the LDS storage reserved for the N'th vertex, where N + * is in emit order; that is: + * - during the epilogue, N is the threadidx (relative to the entire threadgroup) + * - during vertex emit, i.e. while the API GS shader invocation is running, + * N = threadidx * gs_max_out_vertices + emitidx + * + * Goals of the LDS memory layout: + * 1. Eliminate bank conflicts on write for geometry shaders that have all emits + * in uniform control flow + * 2. Eliminate bank conflicts on read for export if, additionally, there is no + * culling + * 3. Agnostic to the number of waves (since we don't know it before compiling) + * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.) + * 5. Avoid wasting memory. + * + * We use an AoS layout due to point 4 (this also helps point 3). In an AoS + * layout, elimination of bank conflicts requires that each vertex occupy an + * odd number of dwords. We use the additional dword to store the output stream + * index as well as a flag to indicate whether this vertex ends a primitive + * for rasterization. + * + * Swizzling is required to satisfy points 1 and 2 simultaneously. + * + * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). + * Indices are swizzled in groups of 32, which ensures point 1 without + * disturbing point 2. 
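+ * For instance, with gs.vertices_out == 4 (write_stride_2exp == 2) the
+ * vertex at linear index i is stored at index i ^ ((i >> 5) & 3), so each
+ * successive group of 32 vertices is XOR-shifted to start on a different
+ * bank.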
+ *
+ * \return an LDS pointer to type {[N x i32], [4 x i8]}
+ */
+static LLVMValueRef
+ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
+
+	/* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
+	unsigned write_stride_2exp = ffs(ctx->shader->info.gs.vertices_out) - 1;
+	if (write_stride_2exp) {
+		LLVMValueRef row =
+			LLVMBuildLShr(builder, vertexidx,
+				      LLVMConstInt(ctx->ac.i32, 5, false), "");
+		LLVMValueRef swizzle =
+			LLVMBuildAnd(builder, row,
+				     LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
						  false), "");
+		vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
+	}
+
+	return ac_build_gep0(&ctx->ac, storage, vertexidx);
+}
+
+static LLVMValueRef
+ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread,
+		       LLVMValueRef emitidx)
+{
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef tmp;
+
+	tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false);
+	tmp = LLVMBuildMul(builder, tmp, gsthread, "");
+	const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
+	return ngg_gs_vertex_ptr(ctx, vertexidx);
+}
+
+static LLVMValueRef
+ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
+			   unsigned out_idx)
+{
+	LLVMValueRef gep_idx[3] = {
+		ctx->ac.i32_0, /* implied C-style array */
+		ctx->ac.i32_0, /* first struct entry */
+		LLVMConstInt(ctx->ac.i32, out_idx, false),
+	};
+	return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
+}
+
+static LLVMValueRef
+ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr,
+			     unsigned stream)
+{
+	LLVMValueRef gep_idx[3] = {
+		ctx->ac.i32_0, /* implied C-style array */
+		ctx->ac.i32_1, /* second struct entry */
+		LLVMConstInt(ctx->ac.i32, stream, false),
+	};
+	return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
+}
+
+static struct radv_stream_output *
+radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
+{
+	for (unsigned i = 0; i < so->num_outputs; ++i) {
+		if (so->outputs[i].location == location)
+			return &so->outputs[i];
+	}
+
+	return NULL;
+}
+
+static void build_streamout_vertex(struct radv_shader_context *ctx,
+				   LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
+				   unsigned stream, LLVMValueRef offset_vtx,
+				   LLVMValueRef vertexptr)
+{
+	struct radv_streamout_info *so = &ctx->args->shader_info->so;
+	LLVMBuilderRef builder = ctx->ac.builder;
+	LLVMValueRef offset[4] = {};
+	LLVMValueRef tmp;
+
+	for (unsigned buffer = 0; buffer < 4; ++buffer) {
+		if (!wg_offset_dw[buffer])
+			continue;
+
+		tmp = LLVMBuildMul(builder, offset_vtx,
+				   LLVMConstInt(ctx->ac.i32, so->strides[buffer], false), "");
+		tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
+		offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
+	}
+
+	if (ctx->stage == MESA_SHADER_GEOMETRY) {
+		struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
+		unsigned noutput = 0;
+		unsigned out_idx = 0;
+
+		for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+			unsigned output_usage_mask =
+				ctx->args->shader_info->gs.output_usage_mask[i];
+			uint8_t output_stream =
+				ctx->args->shader_info->gs.output_streams[i];
+
+			if (!(ctx->output_mask & (1ull << i)) ||
+			    output_stream != stream)
+				continue;
+
+			outputs[noutput].slot_name = i;
+			outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
+			outputs[noutput].usage_mask
= output_usage_mask; + + int length = util_last_bit(output_usage_mask); + + for (unsigned j = 0; j < length; j++, out_idx++) { + if (!(output_usage_mask & (1 << j))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertexptr, + LLVMConstInt(ctx->ac.i32, out_idx, false)); + outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, ""); } + + for (unsigned j = length; j < 4; j++) + outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32); + + noutput++; + } + + for (unsigned i = 0; i < noutput; i++) { + struct radv_stream_output *output = + radv_get_stream_output_by_loc(so, outputs[i].slot_name); + + if (!output || + output->stream != stream) + continue; + + struct radv_shader_output_values out = {}; + + for (unsigned j = 0; j < 4; j++) { + out.values[j] = outputs[i].values[j]; + } + + radv_emit_stream_output(ctx, so_buffer, offset, output, &out); + } + } else { + for (unsigned i = 0; i < so->num_outputs; ++i) { + struct radv_stream_output *output = + &ctx->args->shader_info->so.outputs[i]; + + if (stream != output->stream) + continue; + + struct radv_shader_output_values out = {}; + + for (unsigned comp = 0; comp < 4; comp++) { + if (!(output->component_mask & (1 << comp))) + continue; + + tmp = ac_build_gep0(&ctx->ac, vertexptr, + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + out.values[comp] = LLVMBuildLoad(builder, tmp, ""); + } + + radv_emit_stream_output(ctx, so_buffer, offset, output, &out); + } + } +} + +struct ngg_streamout { + LLVMValueRef num_vertices; + + /* per-thread data */ + LLVMValueRef prim_enable[4]; /* i1 per stream */ + LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ + + /* Output */ + LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ +}; + +/** + * Build streamout logic. + * + * Implies a barrier. + * + * Writes number of emitted primitives to gs_ngg_scratch[4:7]. + * + * Clobbers gs_ngg_scratch[8:]. + */ +static void build_streamout(struct radv_shader_context *ctx, + struct ngg_streamout *nggso) +{ + struct radv_streamout_info *so = &ctx->args->shader_info->so; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); + LLVMValueRef tid = get_thread_id_in_tg(ctx); + LLVMValueRef cond, tmp, tmp2; + LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); + LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); + LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); + LLVMValueRef so_buffer[4] = {}; + unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + + (nggso->vertices[2] ? 1 : 0); + LLVMValueRef prim_stride_dw[4] = {}; + LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); + int stream_for_buffer[4] = { -1, -1, -1, -1 }; + unsigned bufmask_for_stream[4] = {}; + bool isgs = ctx->stage == MESA_SHADER_GEOMETRY; + unsigned scratch_emit_base = isgs ? 4 : 0; + LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0; + unsigned scratch_offset_base = isgs ? 8 : 4; + LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + /* Determine the mapping of streamout buffers to vertex streams. 
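+	 * e.g. if stream 0 feeds buffers 0 and 1 and stream 1 feeds buffer 2,
+	 * this loop yields stream_for_buffer = {0, 0, 1, -1} and
+	 * bufmask_for_stream = {0x3, 0x4, 0, 0}.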
*/ + for (unsigned i = 0; i < so->num_outputs; ++i) { + unsigned buf = so->outputs[i].buffer; + unsigned stream = so->outputs[i].stream; + assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); + stream_for_buffer[buf] = stream; + bufmask_for_stream[stream] |= 1 << buf; + } + + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + assert(so->strides[buffer]); + + LLVMValueRef stride_for_buffer = + LLVMConstInt(ctx->ac.i32, so->strides[buffer], false); + prim_stride_dw[buffer] = + LLVMBuildMul(builder, stride_for_buffer, + nggso->num_vertices, ""); + prim_stride_dw_vgpr = ac_build_writelane( + &ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], + LLVMConstInt(ctx->ac.i32, buffer, false)); + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false); + so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + offset); + } + + cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, cond, 5200); + { + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + /* Advance the streamout offsets in GDS. */ + LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, cond, 5210); + { + /* Fetch the number of generated primitives and store + * it in GDS for later use. + */ + if (isgs) { + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); + tmp = LLVMBuildLoad(builder, tmp, ""); + } else { + tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, + ngg_get_prim_cnt(ctx), ctx->ac.i32_0); + } + LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); + + unsigned swizzle[4]; + int unused_stream = -1; + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) { + unused_stream = stream; + break; + } + } + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + swizzle[buffer] = stream_for_buffer[buffer]; + } else { + assert(unused_stream >= 0); + swizzle[buffer] = unused_stream; + } + } + + tmp = ac_build_quad_swizzle(&ctx->ac, tmp, + swizzle[0], swizzle[1], swizzle[2], swizzle[3]); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + + LLVMValueRef args[] = { + LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), + tmp, + ctx->ac.i32_0, // ordering + ctx->ac.i32_0, // scope + ctx->ac.i1false, // isVolatile + LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index + ctx->ac.i1true, // wave release + ctx->ac.i1true, // wave done + }; + + tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", + ctx->ac.i32, args, ARRAY_SIZE(args), 0); + + /* Keep offsets in a VGPR for quick retrieval via readlane by + * the first wave for bounds checking, and also store in LDS + * for retrieval by all waves later. */ + LLVMBuildStore(builder, tmp, offsets_vgpr); + + tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), + scratch_offset_basev, ""); + tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); + LLVMBuildStore(builder, tmp, tmp2); + } + ac_build_endif(&ctx->ac, 5210); + + /* Determine the max emit per buffer. 
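+		 * ("max emit" being how many whole primitives still fit, i.e.
+		 * (bufsize_dw - offset_dw) / prim_stride_dw, clamped to 0 once
+		 * the buffer has already overflowed.)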
This is done via the SALU, in part + * because LLVM can't generate divide-by-multiply if we try to do this + * via VALU with one lane per buffer. + */ + LLVMValueRef max_emit[4] = {}; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == -1) + continue; + + /* Compute the streamout buffer size in DWORD. */ + LLVMValueRef bufsize_dw = + LLVMBuildLShr(builder, + LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), + i32_2, ""); + + /* Load the streamout buffer offset from GDS. */ + tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); + LLVMValueRef offset_dw = + ac_build_readlane(&ctx->ac, tmp, + LLVMConstInt(ctx->ac.i32, buffer, false)); + + /* Compute the remaining size to emit. */ + LLVMValueRef remaining_dw = + LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); + tmp = LLVMBuildUDiv(builder, remaining_dw, + prim_stride_dw[buffer], ""); + + cond = LLVMBuildICmp(builder, LLVMIntULT, + bufsize_dw, offset_dw, ""); + max_emit[buffer] = LLVMBuildSelect(builder, cond, + ctx->ac.i32_0, tmp, ""); + } + + /* Determine the number of emitted primitives per stream and fixup the + * GDS counter if necessary. + * + * This is complicated by the fact that a single stream can emit to + * multiple buffers (but luckily not vice versa). + */ + LLVMValueRef emit_vgpr = ctx->ac.i32_0; + + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; + + /* Load the number of generated primitives from GDS and + * determine that number for the given stream. + */ + tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); + LLVMValueRef generated = + ac_build_readlane(&ctx->ac, tmp, + LLVMConstInt(ctx->ac.i32, stream, false)); + + + /* Compute the number of emitted primitives. */ + LLVMValueRef emit = generated; + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] == stream) + emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); + } + + /* Store the number of emitted primitives for that + * stream. + */ + emit_vgpr = ac_build_writelane(&ctx->ac, emit_vgpr, emit, + LLVMConstInt(ctx->ac.i32, stream, false)); + + /* Fixup the offset using a plain GDS atomic if we overflowed. */ + cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); + ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */ + tmp = LLVMBuildLShr(builder, + LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), + ac_get_thread_id(&ctx->ac), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5222); + { + tmp = LLVMBuildSub(builder, generated, emit, ""); + tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); + tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, + LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5222); + ac_build_endif(&ctx->ac, 5221); + } + + /* Store the number of emitted primitives to LDS for later use. 
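+		 * They go to gs_ngg_scratch[scratch_emit_base + stream] and
+		 * are fetched again by every wave (via readlane) after the
+		 * barrier below.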
*/ + cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); + ac_build_ifcc(&ctx->ac, cond, 5225); + { + tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), + scratch_emit_basev, ""); + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); + LLVMBuildStore(builder, emit_vgpr, tmp); + } + ac_build_endif(&ctx->ac, 5225); + } + ac_build_endif(&ctx->ac, 5200); + + /* Determine the workgroup-relative per-thread / primitive offset into + * the streamout buffers */ + struct ac_wg_scan primemit_scan[4] = {}; + + if (isgs) { + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; + + primemit_scan[stream].enable_exclusive = true; + primemit_scan[stream].op = nir_op_iadd; + primemit_scan[stream].src = nggso->prim_enable[stream]; + primemit_scan[stream].scratch = + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); + primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); + primemit_scan[stream].numwaves = get_tgsize(ctx); + primemit_scan[stream].maxwaves = 8; + ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); } } -} -static void -handle_ls_outputs_post(struct radv_shader_context *ctx) -{ - LLVMValueRef vertex_id = ctx->rel_auto_id; - uint32_t num_tcs_inputs = util_last_bit64(ctx->shader_info->vs.ls_outputs_written); - LLVMValueRef vertex_dw_stride = LLVMConstInt(ctx->ac.i32, num_tcs_inputs * 4, false); - LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, - vertex_dw_stride, ""); + ac_build_s_barrier(&ctx->ac); - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; + /* Fetch the per-buffer offsets and per-stream emit counts in all waves. 
*/ + LLVMValueRef wgoffset_dw[4] = {}; - if (!(ctx->output_mask & (1ull << i))) - continue; + { + LLVMValueRef scratch_vgpr; - int param = shader_io_get_unique_index(i); - LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, - LLVMConstInt(ctx->ac.i32, param * 4, false), - ""); - for (unsigned j = 0; j < 4; j++) { - LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - value = ac_to_integer(&ctx->ac, value); - value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); - ac_lds_store(&ctx->ac, dw_addr, value); - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, ""); - } - } -} + tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); + scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); -static LLVMValueRef get_wave_id_in_tg(struct radv_shader_context *ctx) -{ - return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); -} + for (unsigned buffer = 0; buffer < 4; ++buffer) { + if (stream_for_buffer[buffer] >= 0) { + wgoffset_dw[buffer] = ac_build_readlane( + &ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); + } + } -static LLVMValueRef get_tgsize(struct radv_shader_context *ctx) -{ - return ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 28, 4); -} + for (unsigned stream = 0; stream < 4; ++stream) { + if (ctx->args->shader_info->gs.num_stream_output_components[stream]) { + nggso->emit[stream] = ac_build_readlane( + &ctx->ac, scratch_vgpr, + LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); + } + } + } -static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx), - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""); - return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), ""); -} + /* Write out primitive data */ + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; -static LLVMValueRef ngg_get_vtx_cnt(struct radv_shader_context *ctx) -{ - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - LLVMConstInt(ctx->ac.i32, 12, false), - LLVMConstInt(ctx->ac.i32, 9, false), - false); -} + if (isgs) { + ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); + } else { + primemit_scan[stream].result_exclusive = tid; + } -static LLVMValueRef ngg_get_prim_cnt(struct radv_shader_context *ctx) -{ - return ac_build_bfe(&ctx->ac, ctx->gs_tg_info, - LLVMConstInt(ctx->ac.i32, 22, false), - LLVMConstInt(ctx->ac.i32, 9, false), - false); + cond = LLVMBuildICmp(builder, LLVMIntULT, + primemit_scan[stream].result_exclusive, + nggso->emit[stream], ""); + cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], ""); + ac_build_ifcc(&ctx->ac, cond, 5240); + { + LLVMValueRef offset_vtx = + LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, + nggso->num_vertices, ""); + + for (unsigned i = 0; i < max_num_vertices; ++i) { + cond = LLVMBuildICmp(builder, LLVMIntULT, + LLVMConstInt(ctx->ac.i32, i, false), + nggso->num_vertices, ""); + ac_build_ifcc(&ctx->ac, cond, 5241); + build_streamout_vertex(ctx, so_buffer, wgoffset_dw, + stream, offset_vtx, nggso->vertices[i]); + ac_build_endif(&ctx->ac, 5241); + offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); + } + } + ac_build_endif(&ctx->ac, 5240); + } } -static LLVMValueRef -ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) +static unsigned ngg_nogs_vertex_size(struct 
radv_shader_context *ctx) { - unsigned num_outputs = util_bitcount64(ctx->output_mask); + unsigned lds_vertex_size = 0; - if (ctx->options->key.has_multiview_view_index) - num_outputs++; + if (ctx->args->shader_info->so.num_outputs) + lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1; - LLVMTypeRef elements[2] = { - LLVMArrayType(ctx->ac.i32, 4 * num_outputs), - LLVMArrayType(ctx->ac.i8, 4), - }; - LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false); - type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS); - return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, ""); + return lds_vertex_size; } /** - * Return a pointer to the LDS storage reserved for the N'th vertex, where N - * is in emit order; that is: - * - during the epilogue, N is the threadidx (relative to the entire threadgroup) - * - during vertex emit, i.e. while the API GS shader invocation is running, - * N = threadidx * gs_max_out_vertices + emitidx - * - * Goals of the LDS memory layout: - * 1. Eliminate bank conflicts on write for geometry shaders that have all emits - * in uniform control flow - * 2. Eliminate bank conflicts on read for export if, additionally, there is no - * culling - * 3. Agnostic to the number of waves (since we don't know it before compiling) - * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.) - * 5. Avoid wasting memory. - * - * We use an AoS layout due to point 4 (this also helps point 3). In an AoS - * layout, elimination of bank conflicts requires that each vertex occupy an - * odd number of dwords. We use the additional dword to store the output stream - * index as well as a flag to indicate whether this vertex ends a primitive - * for rasterization. - * - * Swizzling is required to satisfy points 1 and 2 simultaneously. - * - * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx). - * Indices are swizzled in groups of 32, which ensures point 1 without - * disturbing point 2. - * - * \return an LDS pointer to type {[N x i32], [4 x i8]} + * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage + * for the vertex outputs. */ -static LLVMValueRef -ngg_gs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexidx) +static LLVMValueRef ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, + LLVMValueRef vtxid) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx); - - /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */ - unsigned write_stride_2exp = ffs(ctx->shader->info.gs.vertices_out) - 1; - if (write_stride_2exp) { - LLVMValueRef row = - LLVMBuildLShr(builder, vertexidx, - LLVMConstInt(ctx->ac.i32, 5, false), ""); - LLVMValueRef swizzle = - LLVMBuildAnd(builder, row, - LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, - false), ""); - vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, ""); - } - - return ac_build_gep0(&ctx->ac, storage, vertexidx); + /* The extra dword is used to avoid LDS bank conflicts. 
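+	 * The resulting odd dword stride (4 * so.num_outputs + 1) keeps
+	 * consecutive vertices from landing in the same LDS bank.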
*/ + unsigned vertex_size = ngg_nogs_vertex_size(ctx); + LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); + LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); + return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); } -static LLVMValueRef -ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, - LLVMValueRef emitidx) +static void +handle_ngg_outputs_post_1(struct radv_shader_context *ctx) { + struct radv_streamout_info *so = &ctx->args->shader_info->so; LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - - tmp = LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false); - tmp = LLVMBuildMul(builder, tmp, gsthread, ""); - const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, ""); - return ngg_gs_vertex_ptr(ctx, vertexidx); -} + LLVMValueRef vertex_ptr = NULL; + LLVMValueRef tmp, tmp2; -/* Send GS Alloc Req message from the first wave of the group to SPI. - * Message payload is: - * - bits 0..10: vertices in group - * - bits 12..22: primitives in group - */ -static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx, - LLVMValueRef vtx_cnt, - LLVMValueRef prim_cnt) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; + assert((ctx->stage == MESA_SHADER_VERTEX || + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader); - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5020); + if (!ctx->args->shader_info->so.num_outputs) + return; - tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),""); - tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp); + vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - ac_build_endif(&ctx->ac, 5020); -} + for (unsigned i = 0; i < so->num_outputs; ++i) { + struct radv_stream_output *output = + &ctx->args->shader_info->so.outputs[i]; -struct ngg_prim { - unsigned num_vertices; - LLVMValueRef isnull; - LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; -}; + unsigned loc = output->location; -static void build_export_prim(struct radv_shader_context *ctx, - const struct ngg_prim *prim) -{ - LLVMBuilderRef builder = ctx->ac.builder; - struct ac_export_args args; - LLVMValueRef tmp; + for (unsigned comp = 0; comp < 4; comp++) { + if (!(output->component_mask & (1 << comp))) + continue; - tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, ""); - args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), ""); - - for (unsigned i = 0; i < prim->num_vertices; ++i) { - tmp = LLVMBuildShl(builder, prim->index[i], - LLVMConstInt(ctx->ac.i32, 10 * i, false), ""); - args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, ""); - tmp = LLVMBuildShl(builder, tmp, - LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), ""); - args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, ""); + tmp = ac_build_gep0(&ctx->ac, vertex_ptr, + LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); + tmp2 = LLVMBuildLoad(builder, + ctx->abi.outputs[4 * loc + comp], ""); + tmp2 = ac_to_integer(&ctx->ac, tmp2); + LLVMBuildStore(builder, tmp2, tmp); + } } - - args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, ""); - args.out[1] = LLVMGetUndef(ctx->ac.f32); - args.out[2] = LLVMGetUndef(ctx->ac.f32); - args.out[3] = LLVMGetUndef(ctx->ac.f32); - - 
args.target = V_008DFC_SQ_EXP_PRIM; - args.enabled_channels = 1; - args.done = true; - args.valid_mask = false; - args.compr = false; - - ac_build_export(&ctx->ac, &args); } static void -handle_ngg_outputs_post(struct radv_shader_context *ctx) +handle_ngg_outputs_post_2(struct radv_shader_context *ctx) { LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; assert((ctx->stage == MESA_SHADER_VERTEX || - ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->is_gs_copy_shader); + ctx->stage == MESA_SHADER_TESS_EVAL) && !ctx->args->is_gs_copy_shader); - LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8); - LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 0, 8); + LLVMValueRef prims_in_wave = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8); + LLVMValueRef vtx_in_wave = ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 0, 8); LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), prims_in_wave, ""); LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), vtx_in_wave, ""); LLVMValueRef vtxindex[] = { - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 0, 16), - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[0], 16, 16), - ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[2], 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]), 16, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[2]), 0, 16), }; /* Determine the number of vertices per primitive. */ unsigned num_vertices; + LLVMValueRef num_vertices_val; if (ctx->stage == MESA_SHADER_VERTEX) { + LLVMValueRef outprim_val = + LLVMConstInt(ctx->ac.i32, + ctx->args->options->key.vs.outprim, false); + num_vertices_val = LLVMBuildAdd(builder, outprim_val, + ctx->ac.i32_1, ""); num_vertices = 3; /* TODO: optimize for points & lines */ } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); @@ -3156,16 +2797,30 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) num_vertices = 2; else num_vertices = 3; + + num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false); } - /* TODO: streamout */ + /* Streamout */ + if (ctx->args->shader_info->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = num_vertices_val; + nggso.prim_enable[0] = is_gs_thread; + + for (unsigned i = 0; i < num_vertices; ++i) + nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + + build_streamout(ctx, &nggso); + } /* Copy Primitive IDs from GS threads to the LDS address corresponding * to the ES thread of the provoking vertex. */ if (ctx->stage == MESA_SHADER_VERTEX && - ctx->options->key.vs_common_out.export_prim_id) { - /* TODO: streamout */ + ctx->args->options->key.vs_common_out.export_prim_id) { + if (ctx->args->shader_info->so.num_outputs) + ac_build_s_barrier(&ctx->ac); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); /* Extract the PROVOKING_VTX_INDEX field. 
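 		 * The primitive ID is then stored to the LDS slot that belongs
 		 * to the provoking vertex's ES thread, where the vertex export
 		 * path below loads it back for the flat PRIMITIVE_ID varying.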
*/ @@ -3177,24 +2832,18 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) LLVMValueRef provoking_vtx_index = LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, ""); - LLVMBuildStore(builder, ctx->abi.gs_prim_id, + LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_prim_id), ac_build_gep0(&ctx->ac, ctx->esgs_ring, provoking_vtx_index)); ac_build_endif(&ctx->ac, 5400); } /* TODO: primitive culling */ - build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); /* TODO: streamout queries */ - /* Export primitive data to the index buffer. Format is: - * - bits 0..8: index 0 - * - bit 9: edge flag 0 - * - bits 10..18: index 1 - * - bit 19: edge flag 1 - * - bits 20..28: index 2 - * - bit 29: edge flag 2 - * - bit 31: null primitive (skip) + /* Export primitive data to the index buffer. * * For the first version, we will always build up all three indices * independent of the primitive type. The additional garbage data @@ -3205,19 +2854,24 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) */ ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); { - struct ngg_prim prim = {}; + struct ac_ngg_prim prim = {}; - prim.num_vertices = num_vertices; - prim.isnull = ctx->ac.i1false; - memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); - - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id, - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) { + prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]); + } else { + prim.num_vertices = num_vertices; + prim.isnull = ctx->ac.i1false; + memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + + for (unsigned i = 0; i < num_vertices; ++i) { + tmp = LLVMBuildLShr(builder, + ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); + prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + } } - build_export_prim(ctx, &prim); + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 6001); @@ -3225,15 +2879,16 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) ac_build_ifcc(&ctx->ac, is_es_thread, 6002); { struct radv_vs_output_info *outinfo = - ctx->stage == MESA_SHADER_TESS_EVAL ? &ctx->shader_info->tes.outinfo : &ctx->shader_info->vs.outinfo; + ctx->stage == MESA_SHADER_TESS_EVAL ? + &ctx->args->shader_info->tes.outinfo : &ctx->args->shader_info->vs.outinfo; /* Exporting the primitive ID is handled below. 
*/ /* TODO: use the new VS export path */ handle_vs_outputs_post(ctx, false, - ctx->options->key.vs_common_out.export_clip_dists, + ctx->args->options->key.vs_common_out.export_clip_dists, outinfo); - if (ctx->options->key.vs_common_out.export_prim_id) { + if (ctx->args->options->key.vs_common_out.export_prim_id) { unsigned param_count = outinfo->param_exports; LLVMValueRef values[4]; @@ -3246,7 +2901,7 @@ handle_ngg_outputs_post(struct radv_shader_context *ctx) values[0] = LLVMBuildLoad(builder, tmp, ""); } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); - values[0] = ctx->abi.tes_patch_id; + values[0] = ac_get_arg(&ctx->ac, ctx->args->ac.tes_patch_id); } values[0] = ac_to_float(&ctx->ac, values[0]); @@ -3307,7 +2962,7 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) unsigned num_components; num_components = - ctx->shader_info->gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -3327,16 +2982,35 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second entry of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - LLVMBuildStore(builder, i8_0, tmp); + LLVMBuildStore(builder, i8_0, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); ac_build_endloop(&ctx->ac, 5100); } + + /* Accumulate generated primitives counts across the entire threadgroup. */ + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + + num_components = + ctx->args->shader_info->gs.num_stream_output_components[stream]; + if (!num_components) + continue; + + LLVMValueRef numprims = + LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); + numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size); + + tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, ""); + ac_build_ifcc(&ctx->ac, tmp, 5105); + { + LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, + LLVMConstInt(ctx->ac.i32, stream, false)), + numprims, LLVMAtomicOrderingMonotonic, false); + } + ac_build_endif(&ctx->ac, 5105); + } } static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) @@ -3350,7 +3024,60 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) const LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); - /* TODO: streamout */ + /* Streamout */ + if (ctx->args->shader_info->so.num_outputs) { + struct ngg_streamout nggso = {}; + + nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); + + LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); + for (unsigned stream = 0; stream < 4; ++stream) { + if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) + continue; + + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); + nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); + } + + for (unsigned i = 0; i < verts_per_prim; ++i) { + tmp = LLVMBuildSub(builder, tid, + LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); + tmp = 
ngg_gs_vertex_ptr(ctx, tmp); + nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); + } + + build_streamout(ctx, &nggso); + } + + /* Write shader query data. */ + tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + /* Use a plain GDS atomic to accumulate the number of generated + * primitives. + */ + ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, + tmp, sync_scope); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); /* TODO: culling */ @@ -3372,13 +3099,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) /* Load primitive liveness */ tmp = ngg_gs_vertex_ptr(ctx, primidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* stream 0 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); @@ -3424,7 +3146,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) * there are 4 or more contiguous null primitives in the export * (in the common case of single-dword prim exports). */ - build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + vertlive_scan.result_reduce, num_emit_threads); /* Setup the reverse vertex compaction permutation. 
We re-use stream 1 * of the primitive liveness flags, relying on the fact that each @@ -3432,14 +3155,9 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) ac_build_ifcc(&ctx->ac, vertlive, 5130); { tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, tmp); + LLVMBuildStore(builder, tmp2, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); } ac_build_endif(&ctx->ac, 5130); @@ -3449,19 +3167,14 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); ac_build_ifcc(&ctx->ac, tmp, 5140); { - struct ngg_prim prim = {}; + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; prim.num_vertices = verts_per_prim; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* primflag */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); - prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp, - LLVMConstInt(ctx->ac.i8, 0, false), ""); + flags = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, @@ -3469,7 +3182,25 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) prim.edgeflag[i] = ctx->ac.i1false; } - build_export_prim(ctx, &prim); + /* Geometry shaders output triangle strips, but NGG expects + * triangles. We need to change the vertex order for odd + * triangles to get correct front/back facing by swapping 2 + * vertex indices, but we also have to keep the provoking + * vertex in the same place. 
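+		 * e.g. for an odd strip triangle (v0, v1, v2) the code exports
+		 * (v0, v2, v1): the winding is restored while v0 stays the
+		 * provoking vertex.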
+ */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + + struct ac_ngg_prim in = prim; + prim.index[0] = in.index[0]; + prim.index[1] = LLVMBuildSelect(builder, is_odd, + in.index[2], in.index[1], ""); + prim.index[2] = LLVMBuildSelect(builder, is_odd, + in.index[1], in.index[2], ""); + } + + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 5140); @@ -3477,8 +3208,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); ac_build_ifcc(&ctx->ac, tmp, 5145); { - struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo; - bool export_view_index = ctx->options->key.has_multiview_view_index; + struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo; + bool export_view_index = ctx->args->options->key.has_multiview_view_index; struct radv_shader_output_values *outputs; unsigned noutput = 0; @@ -3491,21 +3222,15 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) outinfo->pos_exports = 0; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1: source data index */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); unsigned out_idx = 0; - gep_idx[1] = ctx->ac.i32_0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i))) @@ -3519,8 +3244,7 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) if (!(output_usage_mask & (1 << j))) continue; - gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); tmp = LLVMBuildLoad(builder, tmp, ""); LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); @@ -3543,14 +3267,15 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) outputs[noutput].slot_name = VARYING_SLOT_LAYER; outputs[noutput].slot_index = 0; outputs[noutput].usage_mask = 0x1; - outputs[noutput].values[0] = ac_to_float(&ctx->ac, ctx->abi.view_index); + outputs[noutput].values[0] = + ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.view_index)); for (unsigned j = 1; j < 4; j++) outputs[noutput].values[j] = ctx->ac.f32_0; noutput++; } radv_llvm_export_vs(ctx, outputs, noutput, outinfo, - ctx->options->key.vs_common_out.export_clip_dists); + ctx->args->options->key.vs_common_out.export_clip_dists); FREE(outputs); } ac_build_endif(&ctx->ac, 5145); @@ -3558,34 +3283,20 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs) { LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - - /* If this thread has 
already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - */ - const LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - ac_build_kill_if_false(&ctx->ac, can_emit); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); unsigned out_idx = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; uint8_t output_stream = - ctx->shader_info->gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -3599,20 +3310,21 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first entry of struct */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - out_val = ac_to_integer(&ctx->ac, out_val); out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); - LLVMBuildStore(builder, out_val, ptr); + LLVMBuildStore(builder, out_val, + ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } - assert(out_idx * 4 <= ctx->shader_info->gs.gsvs_vertex_size); + assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size); + + /* Store the current number of emitted vertices to zero out remaining + * primitive flags in case the geometry shader doesn't emit the maximum + * number of vertices. + */ + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); @@ -3621,19 +3333,31 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. 
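+	 * With curverts counting the vertices emitted so far in the current
+	 * strip, triangles complete at curverts == 2, 3, 4, ..., so
+	 * (curverts & 1) flags the odd ones (the 2nd, 4th, ... triangle).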
+ */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && + si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - const LLVMValueRef primflagptr = - LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp, primflagptr); + tmp = LLVMBuildOr(builder, tmp, + LLVMBuildShl(builder, + LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), + ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); @@ -3644,15 +3368,16 @@ static void write_tess_factors(struct radv_shader_context *ctx) { unsigned stride, outer_comps, inner_comps; - LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5); - LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8); + LLVMValueRef tcs_rel_ids = ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids); + LLVMValueRef invocation_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 8, 5); + LLVMValueRef rel_patch_id = ac_unpack_param(&ctx->ac, tcs_rel_ids, 0, 8); unsigned tess_inner_index = 0, tess_outer_index; LLVMValueRef lds_base, lds_inner = NULL, lds_outer, byteoffset, buffer; LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4]; int i; ac_emit_barrier(&ctx->ac, ctx->stage); - switch (ctx->options->key.tcs.primitive_mode) { + switch (ctx->args->options->key.tcs.primitive_mode) { case GL_ISOLINES: stride = 2; outer_comps = 2; @@ -3694,7 +3419,7 @@ write_tess_factors(struct radv_shader_context *ctx) } // LINES reversal - if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) { + if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) { outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer); lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer, ctx->ac.i32_1, ""); @@ -3723,12 +3448,12 @@ write_tess_factors(struct radv_shader_context *ctx) buffer = ctx->hs_ring_tess_factor; - tf_base = ctx->tess_factor_offset; + tf_base = ac_get_arg(&ctx->ac, ctx->args->tess_factor_offset); byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id, LLVMConstInt(ctx->ac.i32, 4 * stride, false), ""); unsigned tf_offset = 0; - if (ctx->options->chip_class <= GFX8) { + if (ctx->ac.chip_class <= GFX8) { ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504); @@ -3737,7 +3462,7 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, false), 1, ctx->ac.i32_0, tf_base, - 0, ac_glc, false); + 0, ac_glc); tf_offset += 4; ac_build_endif(&ctx->ac, 6504); @@ -3746,14 +3471,14 @@ write_tess_factors(struct radv_shader_context *ctx) /* Store the tessellation factors. 
*/ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, - tf_offset, ac_glc, false); + tf_offset, ac_glc); if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, - 16 + tf_offset, ac_glc, false); + 16 + tf_offset, ac_glc); //store to offchip for TES to read - only if TES reads them - if (ctx->options->key.tcs.tes_reads_tess_factors) { + if (ctx->args->options->key.tcs.tes_reads_tess_factors) { LLVMValueRef inner_vec, outer_vec, tf_outer_offset; LLVMValueRef tf_inner_offset; unsigned param_outer, param_inner; @@ -3767,7 +3492,8 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec, outer_comps, tf_outer_offset, - ctx->oc_lds, 0, ac_glc, false); + ac_get_arg(&ctx->ac, ctx->args->oc_lds), + 0, ac_glc); if (inner_comps) { param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER); tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL, @@ -3777,7 +3503,8 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_gather_values(&ctx->ac, inner, inner_comps); ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec, inner_comps, tf_inner_offset, - ctx->oc_lds, 0, ac_glc, false); + ac_get_arg(&ctx->ac, ctx->args->oc_lds), + 0, ac_glc); } } @@ -3844,15 +3571,15 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) } /* Process depth, stencil, samplemask. */ - if (ctx->shader_info->ps.writes_z) { + if (ctx->args->shader_info->ps.writes_z) { depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); } - if (ctx->shader_info->ps.writes_stencil) { + if (ctx->args->shader_info->ps.writes_stencil) { stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); } - if (ctx->shader_info->ps.writes_sample_mask) { + if (ctx->args->shader_info->ps.writes_sample_mask) { samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); } @@ -3861,9 +3588,9 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) * exported. 
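+	 * If depth, stencil or the sample mask is written, the DONE bit is
+	 * carried by that export instead, since it is then the shader's
+	 * last export.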
*/ if (index > 0 && - !ctx->shader_info->ps.writes_z && - !ctx->shader_info->ps.writes_stencil && - !ctx->shader_info->ps.writes_sample_mask) { + !ctx->args->shader_info->ps.writes_z && + !ctx->args->shader_info->ps.writes_stencil && + !ctx->args->shader_info->ps.writes_sample_mask) { unsigned last = index - 1; color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */ @@ -3883,7 +3610,7 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) static void emit_gs_epilogue(struct radv_shader_context *ctx) { - if (ctx->options->key.vs_common_out.as_ngg) { + if (ctx->args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_1(ctx); return; } @@ -3902,16 +3629,16 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, switch (ctx->stage) { case MESA_SHADER_VERTEX: - if (ctx->options->key.vs_common_out.as_ls) + if (ctx->args->options->key.vs_common_out.as_ls) handle_ls_outputs_post(ctx); - else if (ctx->options->key.vs_common_out.as_es) - handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info); - else if (ctx->options->key.vs_common_out.as_ngg) - break; /* handled outside of the shader body */ + else if (ctx->args->options->key.vs_common_out.as_es) + handle_es_outputs_post(ctx, &ctx->args->shader_info->vs.es_info); + else if (ctx->args->options->key.vs_common_out.as_ngg) + handle_ngg_outputs_post_1(ctx); else - handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, - &ctx->shader_info->vs.outinfo); + handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, + ctx->args->options->key.vs_common_out.export_clip_dists, + &ctx->args->shader_info->vs.outinfo); break; case MESA_SHADER_FRAGMENT: handle_fs_outputs_post(ctx); @@ -3923,14 +3650,14 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, handle_tcs_outputs_post(ctx); break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs_common_out.as_es) - handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info); - else if (ctx->options->key.vs_common_out.as_ngg) - break; /* handled outside of the shader body */ + if (ctx->args->options->key.vs_common_out.as_es) + handle_es_outputs_post(ctx, &ctx->args->shader_info->tes.es_info); + else if (ctx->args->options->key.vs_common_out.as_ngg) + handle_ngg_outputs_post_1(ctx); else - handle_vs_outputs_post(ctx, ctx->options->key.vs_common_out.export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, - &ctx->shader_info->tes.outinfo); + handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, + ctx->args->options->key.vs_common_out.export_clip_dists, + &ctx->args->shader_info->tes.outinfo); break; default: break; @@ -3959,15 +3686,15 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) case MESA_SHADER_GEOMETRY: return; case MESA_SHADER_VERTEX: - if (ctx->options->key.vs_common_out.as_ls || - ctx->options->key.vs_common_out.as_es) + if (ctx->args->options->key.vs_common_out.as_ls || + ctx->args->options->key.vs_common_out.as_es) return; - outinfo = &ctx->shader_info->vs.outinfo; + outinfo = &ctx->args->shader_info->vs.outinfo; break; case MESA_SHADER_TESS_EVAL: - if (ctx->options->key.vs_common_out.as_es) + if (ctx->args->options->key.vs_common_out.as_es) return; - outinfo = &ctx->shader_info->tes.outinfo; + outinfo = &ctx->args->shader_info->tes.outinfo; break; default: unreachable("Unhandled shader type"); @@ -3976,16 +3703,16 @@ ac_nir_eliminate_const_vs_outputs(struct 
radv_shader_context *ctx)
 	ac_optimize_vs_outputs(&ctx->ac,
 			       ctx->main_function,
 			       outinfo->vs_output_param_offset,
-			       VARYING_SLOT_MAX,
+			       VARYING_SLOT_MAX, 0,
 			       &outinfo->param_exports);
 }
 
 static void
 ac_setup_rings(struct radv_shader_context *ctx)
 {
-	if (ctx->options->chip_class <= GFX8 &&
+	if (ctx->args->options->chip_class <= GFX8 &&
 	    (ctx->stage == MESA_SHADER_GEOMETRY ||
-	     ctx->options->key.vs_common_out.as_es || ctx->options->key.vs_common_out.as_es)) {
+	     ctx->args->options->key.vs_common_out.as_es)) {
 		unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS;
 		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
@@ -3995,7 +3722,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
 						       offset);
 	}
 
-	if (ctx->is_gs_copy_shader) {
+	if (ctx->args->is_gs_copy_shader) {
 		ctx->gsvs_ring[0] =
 			ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
 					      LLVMConstInt(ctx->ac.i32,
@@ -4026,7 +3753,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
 			LLVMValueRef ring, tmp;
 
 			num_components =
-				ctx->shader_info->gs.num_stream_output_components[stream];
+				ctx->args->shader_info->gs.num_stream_output_components[stream];
 
 			if (!num_components)
 				continue;
@@ -4081,28 +3808,49 @@ radv_nir_get_max_workgroup_size(enum chip_class chip_class,
 				const struct nir_shader *nir)
 {
 	const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1};
-	return radv_get_max_workgroup_size(chip_class, stage, nir ? nir->info.cs.local_size : backup_sizes);
+	unsigned sizes[3];
+	for (unsigned i = 0; i < 3; i++)
+		sizes[i] = nir ? nir->info.cs.local_size[i] : backup_sizes[i];
+	return radv_get_max_workgroup_size(chip_class, stage, sizes);
 }
 
 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
 static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
 {
-	LLVMValueRef count = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
+	LLVMValueRef count =
+		ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8);
 	LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count, ctx->ac.i32_0, "");
-	ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, "");
-	ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, "");
-	ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, "");
+	ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty,
+					       ac_get_arg(&ctx->ac, ctx->args->rel_auto_id),
+					       ctx->abi.instance_id, "");
+	ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty,
+					   ac_get_arg(&ctx->ac, ctx->args->ac.tcs_rel_ids),
+					   ctx->rel_auto_id,
+					   "");
+	ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty,
+					     ac_get_arg(&ctx->ac, ctx->args->ac.tcs_patch_id),
+					     ctx->abi.vertex_id, "");
 }
 
-static void prepare_gs_input_vgprs(struct radv_shader_context *ctx)
+static void prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged)
 {
-	for(int i = 5; i >= 0; --i) {
-		ctx->gs_vtx_offset[i] = ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
-							(i & 1) * 16, 16);
-	}
+	if (merged) {
+		for(int i = 5; i >= 0; --i) {
+			ctx->gs_vtx_offset[i] =
+				ac_unpack_param(&ctx->ac,
						ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i & ~1]),
+						(i & 1) * 16, 16);
+		}
 
-	ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8);
+		ctx->gs_wave_id = ac_unpack_param(&ctx->ac,
+						  ac_get_arg(&ctx->ac, ctx->args->merged_wave_info),
+						  16,
8); + } else { + for (int i = 0; i < 6; i++) + ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[i]); + ctx->gs_wave_id = ac_get_arg(&ctx->ac, ctx->args->gs_wave_id); + } } /* Ensure that the esgs ring is declared. @@ -4129,68 +3877,88 @@ static LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, int shader_count, - struct radv_shader_info *shader_info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; - unsigned i; - ctx.options = options; - ctx.shader_info = shader_info; - - enum ac_float_mode float_mode = - options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : - AC_FLOAT_MODE_DEFAULT; - - ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, options->wave_size, 64); - ctx.context = ctx.ac.context; + ctx.args = args; - radv_nir_shader_info_init(shader_info); + enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT; - for(int i = 0; i < shader_count; ++i) - radv_nir_shader_info_pass(shaders[i], options, shader_info); + if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { + float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO; + } - for (i = 0; i < MAX_SETS; i++) - shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; - for (i = 0; i < AC_UD_MAX_UD; i++) - shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; + ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, + args->options->family, float_mode, + args->shader_info->wave_size, + args->shader_info->ballot_bit_size); + ctx.context = ctx.ac.context; ctx.max_workgroup_size = 0; for (int i = 0; i < shader_count; ++i) { ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size, - radv_nir_get_max_workgroup_size(ctx.options->chip_class, + radv_nir_get_max_workgroup_size(args->options->chip_class, shaders[i]->info.stage, shaders[i])); } if (ctx.ac.chip_class >= GFX10) { if (is_pre_gs_stage(shaders[0]->info.stage) && - options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { ctx.max_workgroup_size = 128; } } - create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2, - shader_count >= 2 ? 
shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX); + create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2); ctx.abi.inputs = &ctx.inputs[0]; ctx.abi.emit_outputs = handle_shader_outputs_post; - ctx.abi.emit_vertex = visit_emit_vertex; + ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter; ctx.abi.load_ubo = radv_load_ubo; ctx.abi.load_ssbo = radv_load_ssbo; ctx.abi.load_sampler_desc = radv_get_sampler_desc; ctx.abi.load_resource = radv_load_resource; ctx.abi.clamp_shadow_reference = false; - ctx.abi.robust_buffer_access = options->robust_buffer_access; + ctx.abi.robust_buffer_access = args->options->robust_buffer_access; - bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && ctx.options->key.vs_common_out.as_ngg; + bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg; if (shader_count >= 2 || is_ngg) ac_init_exec_full_mask(&ctx.ac); - if (options->has_ls_vgpr_init_bug && + if (args->ac.vertex_id.used) + ctx.abi.vertex_id = ac_get_arg(&ctx.ac, args->ac.vertex_id); + if (args->rel_auto_id.used) + ctx.rel_auto_id = ac_get_arg(&ctx.ac, args->rel_auto_id); + if (args->ac.instance_id.used) + ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id); + + if (args->options->has_ls_vgpr_init_bug && shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); + if (is_ngg) { + /* Declare scratch space base for streamout and vertex + * compaction. Whether space is actually allocated is + * determined during linking / PM4 creation. + * + * Add an extra dword per vertex to ensure an odd stride, which + * avoids bank conflicts for SoA accesses. + */ + if (!args->options->key.vs_common_out.as_ngg_passthrough) + declare_esgs_ring(&ctx); + + /* This is really only needed when streamout and / or vertex + * compaction is enabled. 
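+	 * Only the streamout case can be detected at this point, so the
+	 * declaration below is keyed on the shader writing streamout
+	 * outputs.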
+ */ + if (args->shader_info->so.num_outputs) { + LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); + ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, + asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + } + } + for(int i = 0; i < shader_count; ++i) { ctx.stage = shaders[i]->info.stage; ctx.shader = shaders[i]; @@ -4201,7 +3969,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - if (ctx.options->key.vs_common_out.as_ngg) { + if (args->options->key.vs_common_out.as_ngg) { for (unsigned i = 0; i < 4; ++i) { ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); @@ -4209,18 +3977,21 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - /* TODO: streamout */ + unsigned scratch_size = 8; + if (args->shader_info->so.num_outputs) + scratch_size = 44; - LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size); ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32)); LLVMSetAlignment(ctx.gs_ngg_scratch, 4); - ctx.gs_ngg_emit = LLVMBuildIntToPtr(ctx.ac.builder, ctx.ac.i32_0, - LLVMPointerType(LLVMArrayType(ctx.ac.i32, 0), AC_ADDR_SPACE_LDS), - "ngg_emit"); + ctx.gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx.ac.module, + LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx.gs_ngg_emit, LLVMExternalLinkage); + LLVMSetAlignment(ctx.gs_ngg_emit, 4); } ctx.abi.load_inputs = load_gs_input; @@ -4230,26 +4001,36 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.load_patch_vertices_in = load_patch_vertices_in; ctx.abi.store_tcs_outputs = store_tcs_output; if (shader_count == 1) - ctx.tcs_num_inputs = ctx.options->key.tcs.num_inputs; + ctx.tcs_num_inputs = args->options->key.tcs.num_inputs; else - ctx.tcs_num_inputs = util_last_bit64(shader_info->vs.ls_outputs_written); - ctx.tcs_num_patches = get_tcs_num_patches(&ctx); + ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written); + unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written); + unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written); + ctx.tcs_num_patches = + get_tcs_num_patches( + ctx.args->options->key.tcs.input_vertices, + ctx.shader->info.tess.tcs_vertices_out, + ctx.tcs_num_inputs, + tcs_num_outputs, + tcs_num_patch_outputs, + ctx.args->options->tess_offchip_block_dw_size, + ctx.args->options->chip_class, + ctx.args->options->family); } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; ctx.abi.load_patch_vertices_in = load_patch_vertices_in; - ctx.tcs_num_patches = ctx.options->key.tes.num_patches; + ctx.tcs_num_patches = args->options->key.tes.num_patches; } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) { ctx.abi.load_base_vertex = radv_load_base_vertex; } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; - ctx.abi.emit_kill = radv_emit_kill; } if (shaders[i]->info.stage == MESA_SHADER_VERTEX && - 
ctx.options->key.vs_common_out.as_ngg && - ctx.options->key.vs_common_out.export_prim_id) { + args->options->key.vs_common_out.as_ngg && + args->options->key.vs_common_out.export_prim_id) { declare_esgs_ring(&ctx); } @@ -4257,7 +4038,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, if (i) { if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && - ctx.options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_prologue(&ctx); nested_barrier = false; } else { @@ -4291,13 +4072,16 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_setup_rings(&ctx); - LLVMBasicBlockRef merge_block; + LLVMBasicBlockRef merge_block = NULL; if (shader_count >= 2 || is_ngg) { LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); - LLVMValueRef count = ac_unpack_param(&ctx.ac, ctx.merged_wave_info, 8 * i, 8); + LLVMValueRef count = + ac_unpack_param(&ctx.ac, + ac_get_arg(&ctx.ac, args->merged_wave_info), + 8 * i, 8); LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac); LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT, thread_id, count, ""); @@ -4310,10 +4094,10 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, prepare_interp_optimize(&ctx, shaders[i]); else if(shaders[i]->info.stage == MESA_SHADER_VERTEX) handle_vs_inputs(&ctx, shaders[i]); - else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY) - prepare_gs_input_vgprs(&ctx); + else if(shaders[i]->info.stage == MESA_SHADER_GEOMETRY) + prepare_gs_input_vgprs(&ctx, shader_count >= 2); - ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i]); + ac_nir_translate(&ctx.ac, &ctx.abi, &args->ac, shaders[i]); if (shader_count >= 2 || is_ngg) { LLVMBuildBr(ctx.ac.builder, merge_block); @@ -4323,37 +4107,46 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, /* This needs to be outside the if wrapping the shader body, as sometimes * the HW generates waves with 0 es/vs threads. 
*/ if (is_pre_gs_stage(shaders[i]->info.stage) && - ctx.options->key.vs_common_out.as_ngg && + args->options->key.vs_common_out.as_ngg && i == shader_count - 1) { - handle_ngg_outputs_post(&ctx); + handle_ngg_outputs_post_2(&ctx); } else if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY && - ctx.options->key.vs_common_out.as_ngg) { + args->options->key.vs_common_out.as_ngg) { gfx10_ngg_gs_emit_epilogue_2(&ctx); } if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { - shader_info->tcs.num_patches = ctx.tcs_num_patches; - shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx); + unsigned tcs_num_outputs = util_last_bit64(ctx.args->shader_info->tcs.outputs_written); + unsigned tcs_num_patch_outputs = util_last_bit64(ctx.args->shader_info->tcs.patch_outputs_written); + args->shader_info->tcs.num_patches = ctx.tcs_num_patches; + args->shader_info->tcs.lds_size = + calculate_tess_lds_size( + ctx.args->options->key.tcs.input_vertices, + ctx.shader->info.tess.tcs_vertices_out, + ctx.tcs_num_inputs, + ctx.tcs_num_patches, + tcs_num_outputs, + tcs_num_patch_outputs); } } LLVMBuildRetVoid(ctx.ac.builder); - if (options->dump_preoptir) { + if (args->options->dump_preoptir) { fprintf(stderr, "%s LLVM IR:\n\n", - radv_get_shader_name(shader_info, + radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage)); ac_dump_module(ctx.ac.module); fprintf(stderr, "\n"); } - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); if (shader_count == 1) ac_nir_eliminate_const_vs_outputs(&ctx); - if (options->dump_shader) { - ctx.shader_info->private_mem_vgprs = + if (args->options->dump_shader) { + args->shader_info->private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_function); } @@ -4411,7 +4204,7 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, fprintf(stderr, "\n"); } - if (options->record_llvm_ir) { + if (options->record_ir) { char *llvm_ir = LLVMPrintModuleToString(llvm_module); llvm_ir_string = strdup(llvm_ir); LLVMDisposeMessage(llvm_ir); @@ -4444,48 +4237,49 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, free(elf_buffer); } -void +static void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, - struct radv_shader_info *shader_info, + const struct radv_shader_args *args, struct nir_shader *const *nir, - int nir_count, - const struct radv_nir_compiler_options *options) + int nir_count) { LLVMModuleRef llvm_module; - llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info, - options); + llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args); ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage, - radv_get_shader_name(shader_info, + radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage), - options); + args->options); /* Determine the ES type (VS or TES) for the GS on GFX9. 
*/ - if (options->chip_class >= GFX9) { + if (args->options->chip_class >= GFX9) { if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) { - shader_info->gs.es_type = nir[0]->info.stage; + args->shader_info->gs.es_type = nir[0]->info.stage; } } - shader_info->wave_size = options->wave_size; } static void ac_gs_copy_shader_emit(struct radv_shader_context *ctx) { LLVMValueRef vtx_offset = - LLVMBuildMul(ctx->ac.builder, ctx->abi.vertex_id, + LLVMBuildMul(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id), LLVMConstInt(ctx->ac.i32, 4, false), ""); LLVMValueRef stream_id; /* Fetch the vertex stream ID. */ - if (ctx->shader_info->so.num_outputs) { + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs) { stream_id = - ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2); + ac_unpack_param(&ctx->ac, + ac_get_arg(&ctx->ac, + ctx->args->streamout_config), + 24, 2); } else { stream_id = ctx->ac.i32_0; } @@ -4499,14 +4293,14 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) for (unsigned stream = 0; stream < 4; stream++) { unsigned num_components = - ctx->shader_info->gs.num_stream_output_components[stream]; + ctx->args->shader_info->gs.num_stream_output_components[stream]; LLVMBasicBlockRef bb; unsigned offset; - if (!num_components) + if (stream > 0 && !num_components) continue; - if (stream > 0 && !ctx->shader_info->so.num_outputs) + if (stream > 0 && !ctx->args->shader_info->so.num_outputs) continue; bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out"); @@ -4516,9 +4310,9 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) offset = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = - ctx->shader_info->gs.output_usage_mask[i]; + ctx->args->shader_info->gs.output_usage_mask[i]; unsigned output_stream = - ctx->shader_info->gs.output_streams[i]; + ctx->args->shader_info->gs.output_streams[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i)) || @@ -4554,12 +4348,13 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) } } - if (ctx->shader_info->so.num_outputs) + if (!ctx->args->options->use_ngg_streamout && + ctx->args->shader_info->so.num_outputs) radv_emit_streamout(ctx, stream); if (stream == 0) { handle_vs_outputs_post(ctx, false, true, - &ctx->shader_info->vs.outinfo); + &ctx->args->shader_info->vs.outinfo); } LLVMBuildBr(ctx->ac.builder, end_bb); @@ -4568,32 +4363,25 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb); } -void +static void radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, - struct radv_shader_info *shader_info, - const struct radv_nir_compiler_options *options) + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; - ctx.options = options; - ctx.shader_info = shader_info; + ctx.args = args; - enum ac_float_mode float_mode = - options->unsafe_math ? 
AC_FLOAT_MODE_UNSAFE_FP_MATH : - AC_FLOAT_MODE_DEFAULT; + assert(args->is_gs_copy_shader); - ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, - options->family, float_mode, 64, 64); + ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, + args->options->family, AC_FLOAT_MODE_DEFAULT, 64, 64); ctx.context = ctx.ac.context; - ctx.is_gs_copy_shader = true; ctx.stage = MESA_SHADER_VERTEX; ctx.shader = geom_shader; - radv_nir_shader_info_pass(geom_shader, options, shader_info); - - create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX); + create_function(&ctx, MESA_SHADER_VERTEX, false); ac_setup_rings(&ctx); @@ -4607,10 +4395,43 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, LLVMBuildRetVoid(ctx.ac.builder); - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, - MESA_SHADER_VERTEX, "GS Copy Shader", options); + MESA_SHADER_VERTEX, "GS Copy Shader", args->options); (*rbinary)->is_gs_copy_shader = true; } + +void +llvm_compile_shader(struct radv_device *device, + unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary **binary, + struct radv_shader_args *args) +{ + enum ac_target_machine_options tm_options = 0; + struct ac_llvm_compiler ac_llvm; + bool thread_compiler; + + tm_options |= AC_TM_SUPPORTS_SPILL; + if (args->options->check_ir) + tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; + + thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); + + radv_init_llvm_compiler(&ac_llvm, thread_compiler, + args->options->family, tm_options, + args->shader_info->wave_size); + + if (args->is_gs_copy_shader) { + radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args); + } else { + radv_compile_nir_shader(&ac_llvm, binary, args, + shaders, shader_count); + } + + radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); +}
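
For reference, a minimal sketch of how the new llvm_compile_shader entry point might be driven by a caller, assuming a valid device and a populated radv_shader_args; the wrapper name compile_with_llvm is hypothetical and not part of this patch:

static struct radv_shader_binary *
compile_with_llvm(struct radv_device *device,
		  struct nir_shader *const *shaders,
		  unsigned shader_count,
		  struct radv_shader_args *args)
{
	struct radv_shader_binary *binary = NULL;

	/* llvm_compile_shader() takes the GS copy shader path when
	 * args->is_gs_copy_shader is set and the regular NIR path
	 * otherwise; threading and load/store optimizations follow the
	 * RADV_DEBUG_NOTHREADLLVM and RADV_DEBUG_NO_LOAD_STORE_OPT debug
	 * flags, as shown above. */
	llvm_compile_shader(device, shader_count, shaders, &binary, args);

	return binary;
}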