X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_nir_to_llvm.c;h=755b7cb0246111a1647940d3e2006d08058aacee;hb=7832e75ea80e66aaa1254d5576f1162822543257;hp=9cd29e257fb743594bc578d89baa5e50628698b5;hpb=e1387eaf124b298d65707fce77e942af9622cbe1;p=mesa.git diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 9cd29e257fb..755b7cb0246 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -27,17 +27,15 @@ #include "radv_private.h" #include "radv_shader.h" +#include "radv_shader_helper.h" #include "nir/nir.h" #include #include #include -#if HAVE_LLVM >= 0x0700 #include -#endif #include "sid.h" -#include "gfx9d.h" #include "ac_binary.h" #include "ac_llvm_util.h" #include "ac_llvm_build.h" @@ -77,24 +75,29 @@ struct radv_shader_context { LLVMValueRef gs_vtx_offset[6]; LLVMValueRef esgs_ring; - LLVMValueRef gsvs_ring; + LLVMValueRef gsvs_ring[4]; LLVMValueRef hs_ring_tess_offchip; LLVMValueRef hs_ring_tess_factor; LLVMValueRef persp_sample, persp_center, persp_centroid; LLVMValueRef linear_sample, linear_center, linear_centroid; + /* Streamout */ + LLVMValueRef streamout_buffers; + LLVMValueRef streamout_write_idx; + LLVMValueRef streamout_config; + LLVMValueRef streamout_offset[4]; + gl_shader_stage stage; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; + uint64_t float16_shaded_mask; uint64_t input_mask; uint64_t output_mask; - uint8_t num_output_clips; - uint8_t num_output_culls; bool is_gs_copy_shader; - LLVMValueRef gs_next_vertex; + LLVMValueRef gs_next_vertex[4]; unsigned gs_max_out_vertices; unsigned tes_primitive_mode; @@ -251,7 +254,16 @@ get_tcs_num_patches(struct radv_shader_context *ctx) /* Make sure that the data fits in LDS. This assumes the shaders only * use LDS for the inputs and outputs. */ - hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768; + hardware_lds_size = 32768; + + /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single + * threadgroup, even though there is more than 32 KiB LDS. + * + * Test: dEQP-VK.tessellation.shader_input_output.barrier + */ + if (ctx->options->chip_class >= GFX7 && ctx->options->family != CHIP_STONEY) + hardware_lds_size = 65536; + num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); /* Make sure the output data fits in the offchip buffer */ num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size); @@ -260,8 +272,8 @@ get_tcs_num_patches(struct radv_shader_context *ctx) */ num_patches = MIN2(num_patches, 40); - /* SI bug workaround - limit LS-HS threadgroups to only one wave. */ - if (ctx->options->chip_class == SI) { + /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. 
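	 * A wave is 64 threads wide here, so the patch count is clamped to
	 * 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp), which is exactly
	 * what the code below computes.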
*/ + if (ctx->options->chip_class == GFX6) { unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); num_patches = MIN2(num_patches, one_wave); } @@ -401,10 +413,8 @@ get_tcs_out_current_patch_offset(struct radv_shader_context *ctx) LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildAdd(ctx->ac.builder, patch0_offset, - LLVMBuildMul(ctx->ac.builder, patch_stride, - rel_patch_id, ""), - ""); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, + patch0_offset); } static LLVMValueRef @@ -415,17 +425,14 @@ get_tcs_out_current_patch_data_offset(struct radv_shader_context *ctx) LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx); LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); - return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset, - LLVMBuildMul(ctx->ac.builder, patch_stride, - rel_patch_id, ""), - ""); + return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, + patch0_patch_data_offset); } -#define MAX_ARGS 23 +#define MAX_ARGS 64 struct arg_info { LLVMTypeRef types[MAX_ARGS]; LLVMValueRef *assign[MAX_ARGS]; - unsigned array_params_mask; uint8_t count; uint8_t sgpr_count; uint8_t num_sgprs_used; @@ -456,13 +463,6 @@ add_arg(struct arg_info *info, enum ac_arg_regfile regfile, LLVMTypeRef type, } } -static inline void -add_array_arg(struct arg_info *info, LLVMTypeRef type, LLVMValueRef *param_ptr) -{ - info->array_params_mask |= (1 << info->count); - add_arg(info, ARG_SGPR, type, param_ptr); -} - static void assign_arguments(LLVMValueRef main_function, struct arg_info *info) { @@ -501,10 +501,11 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS); for (unsigned i = 0; i < args->sgpr_count; ++i) { + LLVMValueRef P = LLVMGetParam(main_function, i); + ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG); - if (args->array_params_mask & (1 << i)) { - LLVMValueRef P = LLVMGetParam(main_function, i); + if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) { ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS); ac_add_attr_dereferenceable(P, UINT64_MAX); } @@ -516,11 +517,8 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, options->address32_hi); } - if (max_workgroup_size) { - ac_llvm_add_target_dep_function_attr(main_function, - "amdgpu-max-work-group-size", - max_workgroup_size); - } + ac_llvm_set_workgroup_size(main_function, max_workgroup_size); + if (options->unsafe_math) { /* These were copied from some LLVM test. 
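		 * They appear to be target-dependent fast-math attributes
		 * (e.g. "unsafe-fp-math"="true"), letting the AMDGPU backend
		 * relax FP contraction and NaN/Inf handling when unsafe_math
		 * is requested.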
*/ LLVMAddTargetDependentFunctionAttr(main_function, @@ -544,13 +542,11 @@ create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module, static void -set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs, - uint32_t indirect_offset) +set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, + uint8_t num_sgprs) { ud_info->sgpr_idx = *sgpr_idx; ud_info->num_sgprs = num_sgprs; - ud_info->indirect = indirect_offset > 0; - ud_info->indirect_offset = indirect_offset; *sgpr_idx += num_sgprs; } @@ -562,32 +558,34 @@ set_loc_shader(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx, &ctx->shader_info->user_sgprs_locs.shader_data[idx]; assert(ud_info); - set_loc(ud_info, sgpr_idx, num_sgprs, 0); + set_loc(ud_info, sgpr_idx, num_sgprs); } static void set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) { - bool use_32bit_pointers = HAVE_32BIT_POINTERS && - idx != AC_UD_SCRATCH_RING_OFFSETS; + bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); } static void -set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx, - uint32_t indirect_offset) +set_loc_desc(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx) { - struct radv_userdata_info *ud_info = - &ctx->shader_info->user_sgprs_locs.descriptor_sets[idx]; + struct radv_userdata_locations *locs = + &ctx->shader_info->user_sgprs_locs; + struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; assert(ud_info); - set_loc(ud_info, sgpr_idx, HAVE_32BIT_POINTERS ? 1 : 2, indirect_offset); + set_loc(ud_info, sgpr_idx, 1); + + locs->descriptor_sets_enabled |= 1 << idx; } struct user_sgpr_info { bool need_ring_offsets; bool indirect_all_descriptor_sets; + uint8_t remaining_sgprs; }; static bool needs_view_index_sgpr(struct radv_shader_context *ctx, @@ -620,12 +618,56 @@ count_vs_user_sgprs(struct radv_shader_context *ctx) uint8_t count = 0; if (ctx->shader_info->info.vs.has_vertex_buffers) - count += HAVE_32BIT_POINTERS ? 1 : 2; + count++; count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2; return count; } +static void allocate_inline_push_consts(struct radv_shader_context *ctx, + struct user_sgpr_info *user_sgpr_info) +{ + uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; + + /* Only supported if shaders use push constants. */ + if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX) + return; + + /* Only supported if shaders don't have indirect push constants. */ + if (ctx->shader_info->info.has_indirect_push_constants) + return; + + /* Only supported for 32-bit push constants. */ + if (!ctx->shader_info->info.has_only_32bit_push_constants) + return; + + uint8_t num_push_consts = + (ctx->shader_info->info.max_push_constant_used - + ctx->shader_info->info.min_push_constant_used) / 4; + + /* Check if the number of user SGPRs is large enough. */ + if (num_push_consts < remaining_sgprs) { + ctx->shader_info->info.num_inline_push_consts = num_push_consts; + } else { + ctx->shader_info->info.num_inline_push_consts = remaining_sgprs; + } + + /* Clamp to the maximum number of allowed inlined push constants. 
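	 * Illustrative example: with min_push_constant_used == 4 and
	 * max_push_constant_used == 20, the used range spans
	 * (20 - 4) / 4 = 4 dwords, so at most 4 user SGPRs are requested,
	 * bounded by both remaining_sgprs and AC_MAX_INLINE_PUSH_CONSTS.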
*/ + if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) + ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; + + if (ctx->shader_info->info.num_inline_push_consts == num_push_consts && + !ctx->shader_info->info.loads_dynamic_offsets) { + /* Disable the default push constants path if all constants are + * inlined and if shaders don't use dynamic descriptors. + */ + ctx->shader_info->info.loads_push_constants = false; + } + + ctx->shader_info->info.base_inline_push_consts = + ctx->shader_info->info.min_push_constant_used / 4; +} + static void allocate_user_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage, @@ -689,51 +731,64 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, user_sgpr_count++; if (ctx->shader_info->info.loads_push_constants) - user_sgpr_count += HAVE_32BIT_POINTERS ? 1 : 2; + user_sgpr_count++; + + if (ctx->streamout_buffers) + user_sgpr_count++; - uint32_t available_sgprs = ctx->options->chip_class >= GFX9 ? 32 : 16; + uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; uint32_t num_desc_set = util_bitcount(ctx->shader_info->info.desc_set_used_mask); - if (remaining_sgprs / (HAVE_32BIT_POINTERS ? 1 : 2) < num_desc_set) { + if (remaining_sgprs < num_desc_set) { user_sgpr_info->indirect_all_descriptor_sets = true; + user_sgpr_info->remaining_sgprs = remaining_sgprs - 1; + } else { + user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; } + + allocate_inline_push_consts(ctx, user_sgpr_info); } static void declare_global_input_sgprs(struct radv_shader_context *ctx, - gl_shader_stage stage, - bool has_previous_stage, - gl_shader_stage previous_stage, const struct user_sgpr_info *user_sgpr_info, struct arg_info *args, LLVMValueRef *desc_sets) { LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8); - unsigned num_sets = ctx->options->layout ? 
- ctx->options->layout->num_sets : 0; - unsigned stage_mask = 1 << stage; - - if (has_previous_stage) - stage_mask |= 1 << previous_stage; /* 1 for each descriptor set */ if (!user_sgpr_info->indirect_all_descriptor_sets) { - for (unsigned i = 0; i < num_sets; ++i) { - if ((ctx->shader_info->info.desc_set_used_mask & (1 << i)) && - ctx->options->layout->set[i].layout->shader_stages & stage_mask) { - add_array_arg(args, type, - &ctx->descriptor_sets[i]); - } + uint32_t mask = ctx->shader_info->info.desc_set_used_mask; + + while (mask) { + int i = u_bit_scan(&mask); + + add_arg(args, ARG_SGPR, type, &ctx->descriptor_sets[i]); } } else { - add_array_arg(args, ac_array_in_const32_addr_space(type), desc_sets); + add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(type), + desc_sets); } if (ctx->shader_info->info.loads_push_constants) { /* 1 for push constants and dynamic descriptors */ - add_array_arg(args, type, &ctx->abi.push_constants); + add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants); + } + + for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) { + add_arg(args, ARG_SGPR, ctx->ac.i32, + &ctx->abi.inline_push_consts[i]); + } + ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts; + ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts; + + if (ctx->shader_info->info.so.num_outputs) { + add_arg(args, ARG_SGPR, + ac_array_in_const32_addr_space(ctx->ac.v4i32), + &ctx->streamout_buffers); } } @@ -776,6 +831,36 @@ declare_vs_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) } } +static void +declare_streamout_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, + struct arg_info *args) +{ + int i; + + /* Streamout SGPRs. */ + if (ctx->shader_info->info.so.num_outputs) { + assert(stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL); + + if (stage != MESA_SHADER_TESS_EVAL) { + add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_config); + } else { + args->assign[args->count - 1] = &ctx->streamout_config; + args->types[args->count - 1] = ctx->ac.i32; + } + + add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_write_idx); + } + + /* A streamout buffer offset is loaded if the stride is non-zero. */ + for (i = 0; i < 4; i++) { + if (!ctx->shader_info->info.so.strides[i]) + continue; + + add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->streamout_offset[i]); + } +} + static void declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) { @@ -786,48 +871,47 @@ declare_tes_input_vgprs(struct radv_shader_context *ctx, struct arg_info *args) } static void -set_global_input_locs(struct radv_shader_context *ctx, gl_shader_stage stage, - bool has_previous_stage, gl_shader_stage previous_stage, +set_global_input_locs(struct radv_shader_context *ctx, const struct user_sgpr_info *user_sgpr_info, LLVMValueRef desc_sets, uint8_t *user_sgpr_idx) { - unsigned num_sets = ctx->options->layout ? 
- ctx->options->layout->num_sets : 0; - unsigned stage_mask = 1 << stage; - - if (has_previous_stage) - stage_mask |= 1 << previous_stage; + uint32_t mask = ctx->shader_info->info.desc_set_used_mask; if (!user_sgpr_info->indirect_all_descriptor_sets) { - for (unsigned i = 0; i < num_sets; ++i) { - if ((ctx->shader_info->info.desc_set_used_mask & (1 << i)) && - ctx->options->layout->set[i].layout->shader_stages & stage_mask) { - set_loc_desc(ctx, i, user_sgpr_idx, 0); - } else - ctx->descriptor_sets[i] = NULL; + while (mask) { + int i = u_bit_scan(&mask); + + set_loc_desc(ctx, i, user_sgpr_idx); } } else { set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, user_sgpr_idx); - for (unsigned i = 0; i < num_sets; ++i) { - if ((ctx->shader_info->info.desc_set_used_mask & (1 << i)) && - ctx->options->layout->set[i].layout->shader_stages & stage_mask) { - set_loc_desc(ctx, i, user_sgpr_idx, i * 8); - ctx->descriptor_sets[i] = - ac_build_load_to_sgpr(&ctx->ac, - desc_sets, - LLVMConstInt(ctx->ac.i32, i, false)); + while (mask) { + int i = u_bit_scan(&mask); + + ctx->descriptor_sets[i] = + ac_build_load_to_sgpr(&ctx->ac, desc_sets, + LLVMConstInt(ctx->ac.i32, i, false)); - } else - ctx->descriptor_sets[i] = NULL; } + ctx->shader_info->need_indirect_descriptor_sets = true; } if (ctx->shader_info->info.loads_push_constants) { set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); } + + if (ctx->shader_info->info.num_inline_push_consts) { + set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, + ctx->shader_info->info.num_inline_push_consts); + } + + if (ctx->streamout_buffers) { + set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, + user_sgpr_idx); + } } static void @@ -902,9 +986,8 @@ static void create_function(struct radv_shader_context *ctx, switch (stage) { case MESA_SHADER_COMPUTE: - declare_global_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_info, - &args, &desc_sets); + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, + &desc_sets); if (ctx->shader_info->info.cs.uses_grid_size) { add_arg(&args, ARG_SGPR, ctx->ac.v3i32, @@ -925,18 +1008,23 @@ static void create_function(struct radv_shader_context *ctx, &ctx->abi.local_invocation_ids); break; case MESA_SHADER_VERTEX: - declare_global_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_info, - &args, &desc_sets); + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, + &desc_sets); + declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage, previous_stage, &args); if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.view_index); - if (ctx->options->key.vs.as_es) + if (ctx->options->key.vs.as_es) { add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->es2gs_offset); + } else if (ctx->options->key.vs.as_ls) { + /* no extra parameters */ + } else { + declare_streamout_sgprs(ctx, stage, &args); + } declare_vs_input_vgprs(ctx, &args); break; @@ -953,11 +1041,9 @@ static void create_function(struct radv_shader_context *ctx, add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - declare_global_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, - &user_sgpr_info, &args, + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, &desc_sets); + declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage, previous_stage, &args); @@ -973,10 +1059,7 @@ static void create_function(struct radv_shader_context *ctx, declare_vs_input_vgprs(ctx, &args); } else { - declare_global_input_sgprs(ctx, stage, 
- has_previous_stage, - previous_stage, - &user_sgpr_info, &args, + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, &desc_sets); if (needs_view_index) @@ -993,9 +1076,8 @@ static void create_function(struct radv_shader_context *ctx, } break; case MESA_SHADER_TESS_EVAL: - declare_global_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_info, - &args, &desc_sets); + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, + &desc_sets); if (needs_view_index) add_arg(&args, ARG_SGPR, ctx->ac.i32, @@ -1008,6 +1090,7 @@ static void create_function(struct radv_shader_context *ctx, &ctx->es2gs_offset); } else { add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); + declare_streamout_sgprs(ctx, stage, &args); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds); } declare_tes_input_vgprs(ctx, &args); @@ -1025,10 +1108,7 @@ static void create_function(struct radv_shader_context *ctx, add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown - declare_global_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, - &user_sgpr_info, &args, + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, &desc_sets); if (previous_stage != MESA_SHADER_TESS_EVAL) { @@ -1059,10 +1139,7 @@ static void create_function(struct radv_shader_context *ctx, declare_tes_input_vgprs(ctx, &args); } } else { - declare_global_input_sgprs(ctx, stage, - has_previous_stage, - previous_stage, - &user_sgpr_info, &args, + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, &desc_sets); if (needs_view_index) @@ -1090,9 +1167,8 @@ static void create_function(struct radv_shader_context *ctx, } break; case MESA_SHADER_FRAGMENT: - declare_global_input_sgprs(ctx, stage, has_previous_stage, - previous_stage, &user_sgpr_info, - &args, &desc_sets); + declare_global_input_sgprs(ctx, &user_sgpr_info, &args, + &desc_sets); add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask); add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample); @@ -1139,7 +1215,7 @@ static void create_function(struct radv_shader_context *ctx, &user_sgpr_idx); if (ctx->options->supports_spill) { ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", - LLVMPointerType(ctx->ac.i8, AC_CONST_ADDR_SPACE), + LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0, AC_FUNC_ATTR_READNONE); ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, ac_array_in_const_addr_space(ctx->ac.v4i32), ""); @@ -1151,8 +1227,7 @@ static void create_function(struct radv_shader_context *ctx, if (has_previous_stage) user_sgpr_idx = 0; - set_global_input_locs(ctx, stage, has_previous_stage, previous_stage, - &user_sgpr_info, desc_sets, &user_sgpr_idx); + set_global_input_locs(ctx, &user_sgpr_info, desc_sets, &user_sgpr_idx); switch (stage) { case MESA_SHADER_COMPUTE: @@ -1227,13 +1302,34 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false); offset = LLVMConstInt(ctx->ac.i32, base_offset, false); - index = LLVMBuildMul(ctx->ac.builder, index, stride, ""); - offset = LLVMBuildAdd(ctx->ac.builder, offset, index, ""); - desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset); + if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + offset = ac_build_imad(&ctx->ac, index, stride, offset); + } + + desc_ptr = LLVMBuildGEP(ctx->ac.builder, desc_ptr, &offset, 1, ""); desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32); LLVMSetMetadata(desc_ptr, 
ctx->ac.uniform_md_kind, ctx->ac.empty_md); + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc_components[4] = { + LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false), + /* High limit to support variable sizes. */ + LLVMConstInt(ctx->ac.i32, 0xffffffff, false), + LLVMConstInt(ctx->ac.i32, desc_type, false), + }; + + return ac_build_gather_values(&ctx->ac, desc_components, 4); + } + return desc_ptr; } @@ -1293,11 +1389,8 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx, constant16 = LLVMConstInt(ctx->ac.i32, 16, false); param_stride = calc_param_stride(ctx, vertex_index); if (vertex_index) { - base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id, - vertices_per_patch, ""); - - base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr, - vertex_index, ""); + base_addr = ac_build_imad(&ctx->ac, rel_patch_id, + vertices_per_patch, vertex_index); } else { base_addr = rel_patch_id; } @@ -1429,7 +1522,7 @@ store_tcs_output(struct ac_shader_abi *abi, { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); const unsigned location = var->data.location; - const unsigned component = var->data.location_frac; + unsigned component = var->data.location_frac; const bool is_patch = var->data.patch; const bool is_compact = var->data.compact; LLVMValueRef dw_addr; @@ -1447,10 +1540,14 @@ store_tcs_output(struct ac_shader_abi *abi, } param = shader_io_get_unique_index(location); - if (location == VARYING_SLOT_CLIP_DIST0 && - is_compact && const_index > 3) { - const_index -= 3; - param++; + if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { + const_index += component; + component = 0; + + if (const_index >= 4) { + const_index -= 4; + param++; + } } if (!is_patch) { @@ -1475,6 +1572,8 @@ store_tcs_output(struct ac_shader_abi *abi, if (!(writemask & (1 << chan))) continue; LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); + value = ac_to_integer(&ctx->ac, value); + value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); if (store_lds || is_tess_factor) { LLVMValueRef dw_addr_chan = @@ -1515,9 +1614,13 @@ load_tes_input(struct ac_shader_abi *abi, LLVMValueRef result; unsigned param = shader_io_get_unique_index(location); - if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) { - const_index -= 3; - param++; + if ((location == VARYING_SLOT_CLIP_DIST0 || location == VARYING_SLOT_CLIP_DIST1) && is_compact) { + const_index += component; + component = 0; + if (const_index >= 4) { + const_index -= 4; + param++; + } } buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, @@ -1571,10 +1674,13 @@ load_gs_input(struct ac_shader_abi *abi, ctx->ac.i32_0, vtx_offset, soffset, 0, 1, 0, true, false); + } - value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], - type, ""); + if (ac_get_type_size(type) == 2) { + value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], ctx->ac.i32, ""); + value[i] = LLVMBuildTrunc(ctx->ac.builder, value[i], ctx->ac.i16, ""); } + value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], 
type, ""); } result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component); result = ac_to_integer(&ctx->ac, result); @@ -1633,9 +1739,6 @@ radv_get_sample_pos_offset(uint32_t num_samples) case 8: sample_pos_offset = 7; break; - case 16: - sample_pos_offset = 15; - break; default: break; } @@ -1648,7 +1751,8 @@ static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef result; - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false)); + LLVMValueRef index = LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false); + LLVMValueRef ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ring_offsets, &index, 1, ""); ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), ""); @@ -1703,14 +1807,12 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr { LLVMValueRef gs_next_vertex; LLVMValueRef can_emit; - int idx; + unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - assert(stream == 0); - /* Write vertex attribute values to GSVS ring */ gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex, + ctx->gs_next_vertex[stream], ""); /* If this thread has already emitted the declared maximum number of @@ -1722,52 +1824,51 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), ""); ac_build_kill_if_false(&ctx->ac, can_emit); - /* loop num outputs */ - idx = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = ctx->shader_info->info.gs.output_usage_mask[i]; + uint8_t output_stream = + ctx->shader_info->info.gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; - int length = 4; - int slot = idx; - int slot_inc = 1; + int length = util_last_bit(output_usage_mask); - if (!(ctx->output_mask & (1ull << i))) + if (!(ctx->output_mask & (1ull << i)) || + output_stream != stream) continue; - if (i == VARYING_SLOT_CLIP_DIST0) { - /* pack clip and cull into a single set of slots */ - length = ctx->num_output_clips + ctx->num_output_culls; - if (length > 4) - slot_inc = 2; - output_usage_mask = (1 << length) - 1; - } - for (unsigned j = 0; j < length; j++) { if (!(output_usage_mask & (1 << j))) continue; LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, (slot * 4 + j) * ctx->gs_max_out_vertices, false); + LLVMValueRef voffset = + LLVMConstInt(ctx->ac.i32, offset * + ctx->gs_max_out_vertices, false); + + offset++; + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), ""); - out_val = LLVMBuildBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); + out_val = ac_to_integer(&ctx->ac, out_val); + out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); - ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring, + ac_build_buffer_store_dword(&ctx->ac, + ctx->gsvs_ring[stream], out_val, 1, voffset, ctx->gs2vs_offset, 0, 1, 1, true, true); } - idx += slot_inc; } gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex); + LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - ac_build_sendmsg(&ctx->ac, 
AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id); + ac_build_sendmsg(&ctx->ac, + AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + ctx->gs_wave_id); } static void @@ -1828,6 +1929,11 @@ static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef result; + if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) { + /* Do not load the descriptor for inlined uniform blocks. */ + return buffer_ptr; + } + LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, ""); @@ -1869,8 +1975,9 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, break; case AC_DESC_SAMPLER: type = ctx->ac.v4i32; - if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - offset += 64; + if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + offset += radv_combined_image_descriptor_sampler_offset(binding); + } type_size = 16; break; @@ -1878,6 +1985,13 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, type = ctx->ac.v4i32; type_size = 16; break; + case AC_DESC_PLANE_0: + case AC_DESC_PLANE_1: + case AC_DESC_PLANE_2: + type = ctx->ac.v8i32; + type_size = 32; + offset += 32 * (desc_type - AC_DESC_PLANE_0); + break; default: unreachable("invalid desc_type\n"); } @@ -1902,16 +2016,35 @@ static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi, assert(stride % type_size == 0); - if (!index) - index = ctx->ac.i32_0; + LLVMValueRef adjusted_index = index; + if (!adjusted_index) + adjusted_index = ctx->ac.i32_0; - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), ""); + adjusted_index = LLVMBuildMul(builder, adjusted_index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), ""); - list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0)); + LLVMValueRef val_offset = LLVMConstInt(ctx->ac.i32, offset, 0); + list = LLVMBuildGEP(builder, list, &val_offset, 1, ""); list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(type), ""); - return ac_build_load_to_sgpr(&ctx->ac, list, index); + LLVMValueRef descriptor = ac_build_load_to_sgpr(&ctx->ac, list, adjusted_index); + + /* 3 plane formats always have same size and format for plane 1 & 2, so + * use the tail from plane 1 so that we can store only the first 16 bytes + * of the last plane. */ + if (desc_type == AC_DESC_PLANE_2) { + LLVMValueRef descriptor2 = radv_get_sampler_desc(abi, descriptor_set, base_index, constant_index, index, AC_DESC_PLANE_1,image, write, bindless); + + LLVMValueRef components[8]; + for (unsigned i = 0; i < 4; ++i) + components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor, i); + + for (unsigned i = 4; i < 8; ++i) + components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i); + descriptor = ac_build_gather_values(&ctx->ac, components, 8); + } + + return descriptor; } /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. 
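 * The fixup below appears to shift the 2-bit alpha into the top bits
 * (note the c30 == 30 constant) and arithmetic-shift it back down, which
 * sign-extends it for the SNORM/SSCALED/SINT adjust modes.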
@@ -1926,6 +2059,8 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); + alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, ""); + if (adjustment == RADV_ALPHA_ADJUST_SSCALED) alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, ""); else @@ -1953,7 +2088,71 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, ""); } - return alpha; + return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); +} + +static unsigned +get_num_channels_from_data_format(unsigned data_format) +{ + switch (data_format) { + case V_008F0C_BUF_DATA_FORMAT_8: + case V_008F0C_BUF_DATA_FORMAT_16: + case V_008F0C_BUF_DATA_FORMAT_32: + return 1; + case V_008F0C_BUF_DATA_FORMAT_8_8: + case V_008F0C_BUF_DATA_FORMAT_16_16: + case V_008F0C_BUF_DATA_FORMAT_32_32: + return 2; + case V_008F0C_BUF_DATA_FORMAT_10_11_11: + case V_008F0C_BUF_DATA_FORMAT_11_11_10: + case V_008F0C_BUF_DATA_FORMAT_32_32_32: + return 3; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: + case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: + return 4; + default: + break; + } + + return 4; +} + +static LLVMValueRef +radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, + LLVMValueRef value, + unsigned num_channels, + bool is_float) +{ + LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0; + LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1; + LLVMValueRef chan[4]; + + if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) { + unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value)); + + if (num_channels == 4 && num_channels == vec_size) + return value; + + num_channels = MIN2(num_channels, vec_size); + + for (unsigned i = 0; i < num_channels; i++) + chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i); + } else { + if (num_channels) { + assert(num_channels == 1); + chan[0] = value; + } + } + + for (unsigned i = num_channels; i < 4; i++) { + chan[i] = i == 3 ? 
one : zero; + chan[i] = ac_to_integer(&ctx->ac, chan[i]); + } + + return ac_build_gather_values(&ctx->ac, chan, 4); } static void @@ -1968,20 +2167,25 @@ handle_vs_input_decl(struct radv_shader_context *ctx, unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); uint8_t input_usage_mask = ctx->shader_info->info.vs.input_usage_mask[variable->data.location]; - unsigned num_channels = util_last_bit(input_usage_mask); + unsigned num_input_channels = util_last_bit(input_usage_mask); variable->data.driver_location = variable->data.location * 4; + enum glsl_base_type type = glsl_get_base_type(variable->type); for (unsigned i = 0; i < attrib_count; ++i) { LLVMValueRef output[4]; unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0; + unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index]; + unsigned data_format = attrib_format & 0x0f; + unsigned num_format = (attrib_format >> 4) & 0x07; + bool is_float = num_format != V_008F0C_BUF_NUM_FORMAT_UINT && + num_format != V_008F0C_BUF_NUM_FORMAT_SINT; if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index]; if (divisor) { - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.instance_id, - ctx->abi.start_instance, ""); + buffer_index = ctx->abi.instance_id; if (divisor != 1) { buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index, @@ -1998,31 +2202,82 @@ handle_vs_input_decl(struct radv_shader_context *ctx, } else { buffer_index = ctx->ac.i32_0; } + + buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, ""); } else buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, ctx->abi.base_vertex, ""); - t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false); - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + /* Adjust the number of channels to load based on the vertex + * attribute format. + */ + unsigned num_format_channels = get_num_channels_from_data_format(data_format); + unsigned num_channels = MIN2(num_input_channels, num_format_channels); + unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index]; + unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; + unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; + + if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + /* Always load, at least, 3 channels for formats that + * need to be shuffled because X<->Z. 
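			 * This covers BGRA-style vertex formats: the fetched X and Z
			 * components get swapped just after the load (see the
			 * post_shuffle swizzle below), so all three must be fetched.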
+ */ + num_channels = MAX2(num_channels, 3); + } + + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + attrib_offset / attrib_stride, false); - input = ac_build_buffer_load_format(&ctx->ac, t_list, + buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, - ctx->ac.i32_0, - num_channels, false, true); + buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; + } - input = ac_build_expand_to_vec4(&ctx->ac, input, num_channels); + t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + + input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + buffer_index, + LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, + num_channels, + data_format, num_format, + false, false, true); + + if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) { + LLVMValueRef c[4]; + c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); + c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); + c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0); + c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3); + + input = ac_build_gather_values(&ctx->ac, c, 4); + } + + input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, + is_float); for (unsigned chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, ""); + if (type == GLSL_TYPE_FLOAT16) { + output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, ""); + output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, ""); + } } unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3; output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]); for (unsigned chan = 0; chan < 4; chan++) { - ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = - ac_to_integer(&ctx->ac, output[chan]); + output[chan] = ac_to_integer(&ctx->ac, output[chan]); + if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16) + output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, ""); + + ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan]; } } } @@ -2031,12 +2286,13 @@ static void interp_fs_input(struct radv_shader_context *ctx, unsigned attr, LLVMValueRef interp_param, LLVMValueRef prim_mask, + bool float16, LLVMValueRef result[4]) { LLVMValueRef attr_number; unsigned chan; LLVMValueRef i, j; - bool interp = interp_param != NULL; + bool interp = !LLVMIsUndef(interp_param); attr_number = LLVMConstInt(ctx->ac.i32, attr, false); @@ -2063,7 +2319,12 @@ static void interp_fs_input(struct radv_shader_context *ctx, for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - if (interp) { + if (interp && float16) { + result[chan] = ac_build_fs_interp_f16(&ctx->ac, + llvm_chan, + attr_number, + prim_mask, i, j); + } else if (interp) { result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, @@ -2074,6 +2335,31 @@ static void interp_fs_input(struct radv_shader_context *ctx, llvm_chan, attr_number, prim_mask); + result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); + result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? 
ctx->ac.i16 : ctx->ac.i32, ""); + } + } +} + +static void mark_16bit_fs_input(struct radv_shader_context *ctx, + const struct glsl_type *type, + int location) +{ + if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { + unsigned attrib_count = glsl_count_attribute_slots(type, false); + if (glsl_type_is_16bit(type)) { + ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; + } + } else if (glsl_type_is_array(type)) { + unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); + for (unsigned i = 0; i < glsl_get_length(type); ++i) { + mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride); + } + } else { + assert(glsl_type_is_struct_or_ifc(type)); + for (unsigned i = 0; i < glsl_get_length(type); i++) { + mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location); + location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); } } } @@ -2084,12 +2370,24 @@ handle_fs_input_decl(struct radv_shader_context *ctx, { int idx = variable->data.location; unsigned attrib_count = glsl_count_attribute_slots(variable->type, false); - LLVMValueRef interp; + LLVMValueRef interp = NULL; + uint64_t mask; variable->data.driver_location = idx * 4; - ctx->input_mask |= ((1ull << attrib_count) - 1) << variable->data.location; - if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) { + + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + + glsl_get_length(variable->type); + attrib_count = (component_count + 3) / 4; + } else + mark_16bit_fs_input(ctx, variable->type, idx); + + mask = ((1ull << attrib_count) - 1) << variable->data.location; + + if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT || + glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT16 || + glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_STRUCT) { unsigned interp_type; if (variable->data.sample) interp_type = INTERP_SAMPLE; @@ -2099,12 +2397,14 @@ handle_fs_input_decl(struct radv_shader_context *ctx, interp_type = INTERP_CENTER; interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); - } else - interp = NULL; + } + if (interp == NULL) + interp = LLVMGetUndef(ctx->ac.i32); for (unsigned i = 0; i < attrib_count; ++i) ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; + ctx->input_mask |= mask; } static void @@ -2150,8 +2450,10 @@ handle_fs_inputs(struct radv_shader_context *ctx, unsigned index = 0; if (ctx->shader_info->info.ps.uses_input_attachments || - ctx->shader_info->info.needs_multiview_view_index) + ctx->shader_info->info.needs_multiview_view_index) { ctx->input_mask |= 1ull << VARYING_SLOT_LAYER; + ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = LLVMGetUndef(ctx->ac.i32); + } for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) { LLVMValueRef interp_param; @@ -2163,12 +2465,28 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { interp_param = *inputs; - interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, + bool float16 = (ctx->float16_shaded_mask >> i) & 1; + interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16, inputs); - if (!interp_param) + if (LLVMIsUndef(interp_param)) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + if (float16) + ctx->shader_info->fs.float16_shaded_mask |= 1u << index; 
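			/* flat_shaded_mask and float16_shaded_mask are presumably
			 * consumed at pipeline creation time to program the SPI
			 * per-attribute interpolation state. */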
+ if (i >= VARYING_SLOT_VAR0) + ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; ++index; + } else if (i == VARYING_SLOT_CLIP_DIST0) { + int length = ctx->shader_info->info.ps.num_input_clips_culls; + + for (unsigned j = 0; j < length; j += 4) { + inputs = ctx->inputs + ac_llvm_reg_index_soa(i, j); + + interp_param = *inputs; + interp_fs_input(ctx, index, interp_param, + ctx->abi.prim_mask, false, inputs); + ++index; + } } else if (i == VARYING_SLOT_POS) { for(int i = 0; i < 3; ++i) inputs[i] = ctx->abi.frag_pos[i]; @@ -2200,27 +2518,27 @@ scan_shader_output_decl(struct radv_shader_context *ctx, if (stage == MESA_SHADER_TESS_CTRL) return; + if (variable->data.compact) { + unsigned component_count = variable->data.location_frac + + glsl_get_length(variable->type); + attrib_count = (component_count + 3) / 4; + } + mask_attribs = ((1ull << attrib_count) - 1) << idx; if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_GEOMETRY) { if (idx == VARYING_SLOT_CLIP_DIST0) { - int length = shader->info.clip_distance_array_size + - shader->info.cull_distance_array_size; if (stage == MESA_SHADER_VERTEX) { ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; + ctx->shader_info->vs.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; } if (stage == MESA_SHADER_TESS_EVAL) { ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1; ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1; + ctx->shader_info->tes.outinfo.cull_dist_mask <<= shader->info.clip_distance_array_size; } - - if (length > 4) - attrib_count = 2; - else - attrib_count = 1; - mask_attribs = 1ull << idx; } } @@ -2254,7 +2572,11 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, args->out[2] = LLVMGetUndef(ctx->ac.f32); args->out[3] = LLVMGetUndef(ctx->ac.f32); - if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) { + if (!values) + return; + + bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; + if (ctx->stage == MESA_SHADER_FRAGMENT) { unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; @@ -2291,6 +2613,12 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, case V_028714_SPI_SHADER_FP16_ABGR: args->enabled_channels = 0x5; packf = ac_build_cvt_pkrtz_f16; + if (is_16bit) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildFPExt(ctx->ac.builder, + values[chan], + ctx->ac.f32, ""); + } break; case V_028714_SPI_SHADER_UNORM16_ABGR: @@ -2306,11 +2634,23 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, case V_028714_SPI_SHADER_UINT16_ABGR: args->enabled_channels = 0x5; packi = ac_build_cvt_pk_u16; + if (is_16bit) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildZExt(ctx->ac.builder, + ac_to_integer(&ctx->ac, values[chan]), + ctx->ac.i32, ""); + } break; case V_028714_SPI_SHADER_SINT16_ABGR: args->enabled_channels = 0x5; packi = ac_build_cvt_pk_i16; + if (is_16bit) { + for (unsigned chan = 0; chan < 4; chan++) + values[chan] = LLVMBuildSExt(ctx->ac.builder, + ac_to_integer(&ctx->ac, values[chan]), + ctx->ac.i32, ""); + } break; default: @@ -2353,14 +2693,16 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, return; } - 
memcpy(&args->out[0], values, sizeof(values[0]) * 4); - - for (unsigned i = 0; i < 4; ++i) { - if (!(args->enabled_channels & (1 << i))) - continue; + if (is_16bit) { + for (unsigned chan = 0; chan < 4; chan++) { + values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, ""); + args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, ""); + } + } else + memcpy(&args->out[0], values, sizeof(values[0]) * 4); + for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); - } } static void @@ -2383,6 +2725,139 @@ radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan) return LLVMBuildLoad(ctx->ac.builder, output, ""); } +static void +radv_emit_stream_output(struct radv_shader_context *ctx, + LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + const struct radv_stream_output *output) +{ + unsigned num_comps = util_bitcount(output->component_mask); + unsigned loc = output->location; + unsigned buf = output->buffer; + unsigned offset = output->offset; + unsigned start; + LLVMValueRef out[4]; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + /* Get the first component. */ + start = ffs(output->component_mask) - 1; + + /* Load the output as int. */ + for (int i = 0; i < num_comps; i++) { + out[i] = ac_to_integer(&ctx->ac, + radv_load_output(ctx, loc, start + i)); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; + + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v4i32 (aligned to 4) */ + out[3] = LLVMGetUndef(ctx->ac.i32); + /* fall through */ + case 4: /* as v4i32 */ + vdata = ac_build_gather_values(&ctx->ac, out, + !ac_has_vec3_support(ctx->ac.chip_class, false) ? + util_next_power_of_two(num_comps) : + num_comps); + break; + } + + ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], + vdata, num_comps, so_write_offsets[buf], + ctx->ac.i32_0, offset, + 1, 1, true, false); +} + +static void +radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) +{ + struct ac_build_if_state if_ctx; + int i; + + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ + assert(ctx->streamout_config); + LLVMValueRef so_vtx_count = + ac_build_bfe(&ctx->ac, ctx->streamout_config, + LLVMConstInt(ctx->ac.i32, 16, false), + LLVMConstInt(ctx->ac.i32, 7, false), false); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* can_emit = tid < so_vtx_count; */ + LLVMValueRef can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + tid, so_vtx_count, ""); + + /* Emit the streamout code conditionally. This actually avoids + * out-of-bounds buffer access. The hw tells us via the SGPR + * (so_vtx_count) which threads are allowed to emit streamout data. + */ + ac_nir_build_if(&if_ctx, ctx, can_emit); + { + /* The buffer offset is computed as follows: + * ByteOffset = streamout_offset[buffer_id]*4 + + * (streamout_write_index + thread_id)*stride[buffer_id] + + * attrib_offset + */ + LLVMValueRef so_write_index = ctx->streamout_write_idx; + + /* Compute (streamout_write_index + thread_id). */ + so_write_index = + LLVMBuildAdd(ctx->ac.builder, so_write_index, tid, ""); + + /* Load the descriptor and compute the write offset for each + * enabled buffer. 
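		 * Both so_offset and the per-buffer strides are in dwords, so
		 * each is scaled by 4 below, giving
		 * so_write_offset[i] = (write_index + tid) * stride*4 + so_offset*4.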
+ */ + LLVMValueRef so_write_offset[4] = {}; + LLVMValueRef so_buffers[4] = {}; + LLVMValueRef buf_ptr = ctx->streamout_buffers; + + for (i = 0; i < 4; i++) { + uint16_t stride = ctx->shader_info->info.so.strides[i]; + + if (!stride) + continue; + + LLVMValueRef offset = + LLVMConstInt(ctx->ac.i32, i, false); + + so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, + buf_ptr, offset); + + LLVMValueRef so_offset = ctx->streamout_offset[i]; + + so_offset = LLVMBuildMul(ctx->ac.builder, so_offset, + LLVMConstInt(ctx->ac.i32, 4, false), ""); + + so_write_offset[i] = + ac_build_imad(&ctx->ac, so_write_index, + LLVMConstInt(ctx->ac.i32, + stride * 4, false), + so_offset); + } + + /* Write streamout data. */ + for (i = 0; i < ctx->shader_info->info.so.num_outputs; i++) { + struct radv_stream_output *output = + &ctx->shader_info->info.so.outputs[i]; + + if (stream != output->stream) + continue; + + radv_emit_stream_output(ctx, so_buffers, + so_write_offset, output); + } + } + ac_nir_build_endif(&if_ctx); +} + static void handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, bool export_layer_id, @@ -2410,32 +2885,42 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset)); - if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) { - LLVMValueRef slots[8]; - unsigned j; + for(unsigned location = VARYING_SLOT_CLIP_DIST0; location <= VARYING_SLOT_CLIP_DIST1; ++location) { + if (ctx->output_mask & (1ull << location)) { + unsigned output_usage_mask, length; + LLVMValueRef slots[4]; + unsigned j; + + if (ctx->stage == MESA_SHADER_VERTEX && + !ctx->is_gs_copy_shader) { + output_usage_mask = + ctx->shader_info->info.vs.output_usage_mask[location]; + } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { + output_usage_mask = + ctx->shader_info->info.tes.output_usage_mask[location]; + } else { + assert(ctx->is_gs_copy_shader); + output_usage_mask = + ctx->shader_info->info.gs.output_usage_mask[location]; + } - if (outinfo->cull_dist_mask) - outinfo->cull_dist_mask <<= ctx->num_output_clips; + length = util_last_bit(output_usage_mask); - i = VARYING_SLOT_CLIP_DIST0; - for (j = 0; j < ctx->num_output_clips + ctx->num_output_culls; j++) - slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); + for (j = 0; j < length; j++) + slots[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, location, j)); - for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++) - slots[i] = LLVMGetUndef(ctx->ac.f32); + for (i = length; i < 4; i++) + slots[i] = LLVMGetUndef(ctx->ac.f32); - if (ctx->num_output_clips + ctx->num_output_culls > 4) { - target = V_008DFC_SQ_EXP_POS + 3; - si_llvm_init_export_args(ctx, &slots[4], 0xf, target, &args); + target = V_008DFC_SQ_EXP_POS + 2 + (location - VARYING_SLOT_CLIP_DIST0); + si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args); memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); - } - - target = V_008DFC_SQ_EXP_POS + 2; - si_llvm_init_export_args(ctx, &slots[0], 0xf, target, &args); - memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS], - &args, sizeof(args)); + &args, sizeof(args)); + /* Export the clip/cull distances values to the next stage. 
*/ + radv_export_param(ctx, param_count, &slots[0], 0xf); + outinfo->vs_output_param_offset[location] = param_count++; + } } LLVMValueRef pos_values[4] = {ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_1}; @@ -2460,6 +2945,12 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, viewport_index_value = radv_load_output(ctx, VARYING_SLOT_VIEWPORT, 0); } + if (ctx->shader_info->info.so.num_outputs && + !ctx->is_gs_copy_shader) { + /* The GS copy shader emission already emits streamout. */ + radv_emit_streamout(ctx, 0); + } + if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) { @@ -2590,17 +3081,13 @@ handle_es_outputs_post(struct radv_shader_context *ctx, for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { int param_index; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; - if (i == VARYING_SLOT_CLIP_DIST0) - length = ctx->num_output_clips + ctx->num_output_culls; - param_index = shader_io_get_unique_index(i); - max_output_written = MAX2(param_index + (length > 4), max_output_written); + max_output_written = MAX2(param_index, max_output_written); } outinfo->esgs_itemsize = (max_output_written + 1) * 16; @@ -2608,9 +3095,7 @@ handle_es_outputs_post(struct radv_shader_context *ctx, if (ctx->ac.chip_class >= GFX9) { unsigned itemsize_dw = outinfo->esgs_itemsize / 4; LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = ac_build_bfe(&ctx->ac, ctx->merged_wave_info, - LLVMConstInt(ctx->ac.i32, 24, false), - LLVMConstInt(ctx->ac.i32, 4, false), false); + LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4); vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, LLVMBuildMul(ctx->ac.builder, wave_idx, LLVMConstInt(ctx->ac.i32, 64, false), ""), ""); @@ -2623,7 +3108,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; unsigned output_usage_mask; int param_index; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; @@ -2637,11 +3121,6 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ctx->shader_info->info.tes.output_usage_mask[i]; } - if (i == VARYING_SLOT_CLIP_DIST0) { - length = ctx->num_output_clips + ctx->num_output_culls; - output_usage_mask = (1 << length) - 1; - } - param_index = shader_io_get_unique_index(i); if (lds_base) { @@ -2650,12 +3129,13 @@ handle_es_outputs_post(struct radv_shader_context *ctx, ""); } - for (j = 0; j < length; j++) { + for (j = 0; j < 4; j++) { if (!(output_usage_mask & (1 << j))) continue; LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - out_val = LLVMBuildBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); + out_val = ac_to_integer(&ctx->ac, out_val); + out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); if (ctx->ac.chip_class >= GFX9) { LLVMValueRef dw_addr_offset = @@ -2663,8 +3143,7 @@ handle_es_outputs_post(struct radv_shader_context *ctx, LLVMConstInt(ctx->ac.i32, j, false), ""); - ac_lds_store(&ctx->ac, dw_addr_offset, - LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "")); + ac_lds_store(&ctx->ac, dw_addr_offset, out_val); } else { ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, @@ -2688,20 +3167,19 @@ handle_ls_outputs_post(struct radv_shader_context *ctx) for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4]; - int length = 4; if (!(ctx->output_mask & (1ull << i))) continue; - if (i == VARYING_SLOT_CLIP_DIST0) - length = ctx->num_output_clips + 
ctx->num_output_culls; int param = shader_io_get_unique_index(i); LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr, LLVMConstInt(ctx->ac.i32, param * 4, false), ""); - for (unsigned j = 0; j < length; j++) { - ac_lds_store(&ctx->ac, dw_addr, - LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "")); + for (unsigned j = 0; j < 4; j++) { + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); + value = ac_to_integer(&ctx->ac, value); + value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, ""); + ac_lds_store(&ctx->ac, dw_addr, value); dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, ""); } } @@ -2796,7 +3274,7 @@ write_tess_factors(struct radv_shader_context *ctx) LLVMConstInt(ctx->ac.i32, 4 * stride, false), ""); unsigned tf_offset = 0; - if (ctx->options->chip_class <= VI) { + if (ctx->options->chip_class <= GFX8) { ac_nir_build_if(&inner_if_ctx, ctx, LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, "")); @@ -3038,7 +3516,7 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) static void ac_setup_rings(struct radv_shader_context *ctx) { - if (ctx->options->chip_class <= VI && + if (ctx->options->chip_class <= GFX8 && (ctx->stage == MESA_SHADER_GEOMETRY || ctx->options->key.vs.as_es || ctx->options->key.tes.as_es)) { unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS @@ -3051,24 +3529,76 @@ ac_setup_rings(struct radv_shader_context *ctx) } if (ctx->is_gs_copy_shader) { - ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false)); + ctx->gsvs_ring[0] = + ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, + LLVMConstInt(ctx->ac.i32, + RING_GSVS_VS, false)); } + if (ctx->stage == MESA_SHADER_GEOMETRY) { - LLVMValueRef tmp; - uint32_t num_entries = 64; - LLVMValueRef gsvs_ring_stride = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size, false); - LLVMValueRef gsvs_ring_desc = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size << 16, false); - ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false)); + /* The conceptual layout of the GSVS ring is + * v0c0 .. vLv0 v0c1 .. vLc1 .. + * but the real memory layout is swizzled across + * threads: + * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL + * t16v0c0 .. + * Override the buffer descriptor accordingly. + */ + LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2); + uint64_t stream_offset = 0; + unsigned num_records = 64; + LLVMValueRef base_ring; + + base_ring = + ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, + LLVMConstInt(ctx->ac.i32, + RING_GSVS_GS, false)); + + for (unsigned stream = 0; stream < 4; stream++) { + unsigned num_components, stride; + LLVMValueRef ring, tmp; + + num_components = + ctx->shader_info->info.gs.num_stream_output_components[stream]; + + if (!num_components) + continue; + + stride = 4 * num_components * ctx->gs_max_out_vertices; + + /* Limit on the stride field for <= GFX7. 
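			 * The STRIDE field of a buffer resource descriptor is 14 bits
			 * wide, hence the (1 << 14) bound asserted below.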
@@ -2796,7 +3274,7 @@ write_tess_factors(struct radv_shader_context *ctx)
LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
unsigned tf_offset = 0;
- if (ctx->options->chip_class <= VI) {
+ if (ctx->options->chip_class <= GFX8) {
ac_nir_build_if(&inner_if_ctx, ctx,
LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
rel_patch_id, ctx->ac.i32_0, ""));
@@ -3038,7 +3516,7 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx)
static void
ac_setup_rings(struct radv_shader_context *ctx)
{
- if (ctx->options->chip_class <= VI &&
+ if (ctx->options->chip_class <= GFX8 &&
(ctx->stage == MESA_SHADER_GEOMETRY ||
ctx->options->key.vs.as_es || ctx->options->key.tes.as_es)) {
unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
@@ -3051,24 +3529,76 @@ ac_setup_rings(struct radv_shader_context *ctx)
}
if (ctx->is_gs_copy_shader) {
- ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
+ ctx->gsvs_ring[0] =
+ ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
+ LLVMConstInt(ctx->ac.i32,
+ RING_GSVS_VS, false));
}
+
if (ctx->stage == MESA_SHADER_GEOMETRY) {
- LLVMValueRef tmp;
- uint32_t num_entries = 64;
- LLVMValueRef gsvs_ring_stride = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size, false);
- LLVMValueRef gsvs_ring_desc = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size << 16, false);
- ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));
+ /* The conceptual layout of the GSVS ring is
+ * v0c0 .. vLc0 v0c1 .. vLc1 ..
+ * but the real memory layout is swizzled across
+ * threads:
+ * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
+ * t16v0c0 ..
+ * Override the buffer descriptor accordingly.
+ */
+ LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
+ uint64_t stream_offset = 0;
+ unsigned num_records = 64;
+ LLVMValueRef base_ring;
+
+ base_ring =
+ ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets,
+ LLVMConstInt(ctx->ac.i32,
+ RING_GSVS_GS, false));
+
+ for (unsigned stream = 0; stream < 4; stream++) {
+ unsigned num_components, stride;
+ LLVMValueRef ring, tmp;
+
+ num_components =
+ ctx->shader_info->info.gs.num_stream_output_components[stream];
+
+ if (!num_components)
+ continue;
+
+ stride = 4 * num_components * ctx->gs_max_out_vertices;
+
+ /* Limit on the stride field for <= GFX7. */
+ assert(stride < (1 << 14));
+
+ ring = LLVMBuildBitCast(ctx->ac.builder,
+ base_ring, v2i64, "");
+ tmp = LLVMBuildExtractElement(ctx->ac.builder,
+ ring, ctx->ac.i32_0, "");
+ tmp = LLVMBuildAdd(ctx->ac.builder, tmp,
+ LLVMConstInt(ctx->ac.i64,
+ stream_offset, 0), "");
+ ring = LLVMBuildInsertElement(ctx->ac.builder,
+ ring, tmp, ctx->ac.i32_0, "");
+
+ stream_offset += stride * 64;
- ctx->gsvs_ring = LLVMBuildBitCast(ctx->ac.builder, ctx->gsvs_ring, ctx->ac.v4i32, "");
+ ring = LLVMBuildBitCast(ctx->ac.builder, ring,
+ ctx->ac.v4i32, "");
- tmp = LLVMConstInt(ctx->ac.i32, num_entries, false);
- if (ctx->options->chip_class >= VI)
- tmp = LLVMBuildMul(ctx->ac.builder, gsvs_ring_stride, tmp, "");
- ctx->gsvs_ring = LLVMBuildInsertElement(ctx->ac.builder, ctx->gsvs_ring, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
- tmp = LLVMBuildExtractElement(ctx->ac.builder, ctx->gsvs_ring, ctx->ac.i32_1, "");
- tmp = LLVMBuildOr(ctx->ac.builder, tmp, gsvs_ring_desc, "");
- ctx->gsvs_ring = LLVMBuildInsertElement(ctx->ac.builder, ctx->gsvs_ring, tmp, ctx->ac.i32_1, "");
+ tmp = LLVMBuildExtractElement(ctx->ac.builder, ring,
+ ctx->ac.i32_1, "");
+ tmp = LLVMBuildOr(ctx->ac.builder, tmp,
+ LLVMConstInt(ctx->ac.i32,
+ S_008F04_STRIDE(stride), false), "");
+ ring = LLVMBuildInsertElement(ctx->ac.builder, ring, tmp,
+ ctx->ac.i32_1, "");
+
+ ring = LLVMBuildInsertElement(ctx->ac.builder, ring,
+ LLVMConstInt(ctx->ac.i32,
+ num_records, false),
+ LLVMConstInt(ctx->ac.i32, 2, false), "");
+
+ ctx->gsvs_ring[stream] = ring;
+ }
}
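[Editor's note] The rewritten ac_setup_rings() above slices one GSVS buffer into up to four per-stream windows and patches the buffer descriptor (base, stride, num_records) for each. The offset arithmetic, restated as a standalone sketch; the 64 multiplier matches the code's per-64-thread swizzle granularity, and the names are illustrative:

#include <assert.h>
#include <stdint.h>

/* Sketch: per-stream GSVS ring slicing. stride is the bytes one thread
 * needs for one stream (4 bytes x components x max emitted vertices);
 * each stream's window starts where the previous one ended. */
static uint64_t gsvs_stream_offsets(const unsigned num_components[4],
                                    unsigned gs_max_out_vertices,
                                    uint64_t offsets[4])
{
        uint64_t stream_offset = 0;
        for (unsigned stream = 0; stream < 4; stream++) {
                if (!num_components[stream])
                        continue;
                unsigned stride = 4 * num_components[stream] * gs_max_out_vertices;
                assert(stride < (1 << 14)); /* stride field limit on <= GFX7 */
                offsets[stream] = stream_offset;
                stream_offset += (uint64_t)stride * 64;
        }
        return stream_offset; /* total ring bytes consumed */
}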
if (ctx->stage == MESA_SHADER_TESS_CTRL ||
@@ -3078,13 +3608,13 @@ ac_setup_rings(struct radv_shader_context *ctx)
}
}
-static unsigned
-ac_nir_get_max_workgroup_size(enum chip_class chip_class,
- const struct nir_shader *nir)
+unsigned
+radv_nir_get_max_workgroup_size(enum chip_class chip_class,
+ const struct nir_shader *nir)
{
switch (nir->info.stage) {
case MESA_SHADER_TESS_CTRL:
- return chip_class >= CIK ? 128 : 64;
+ return chip_class >= GFX7 ? 128 : 64;
case MESA_SHADER_GEOMETRY:
return chip_class >= GFX9 ? 128 : 64;
case MESA_SHADER_COMPUTE:
@@ -3102,9 +3632,7 @@ ac_nir_get_max_workgroup_size(enum chip_class chip_class,
/* Fixup the HW not emitting the TCS regs if there are no HS threads. */
static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
{
- LLVMValueRef count = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
- LLVMConstInt(ctx->ac.i32, 8, false),
- LLVMConstInt(ctx->ac.i32, 8, false), false);
+ LLVMValueRef count = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 8, 8);
LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
ctx->ac.i32_0, "");
ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id,
ctx->abi.instance_id, "");
@@ -3115,20 +3643,16 @@ static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx)
static void prepare_gs_input_vgprs(struct radv_shader_context *ctx)
{
for(int i = 5; i >= 0; --i) {
- ctx->gs_vtx_offset[i] = ac_build_bfe(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
- LLVMConstInt(ctx->ac.i32, (i & 1) * 16, false),
- LLVMConstInt(ctx->ac.i32, 16, false), false);
+ ctx->gs_vtx_offset[i] = ac_unpack_param(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
+ (i & 1) * 16, 16);
}
- ctx->gs_wave_id = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
- LLVMConstInt(ctx->ac.i32, 16, false),
- LLVMConstInt(ctx->ac.i32, 8, false), false);
+ ctx->gs_wave_id = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 16, 8);
}
static
-LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
- LLVMPassManagerRef passmgr,
+LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
struct nir_shader *const *shaders,
int shader_count,
struct radv_shader_variant_info *shader_info,
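[Editor's note] Several hunks in this patch, including the two functions above, replace three-line ac_build_bfe() calls with ac_unpack_param(&ctx->ac, value, offset, width). Its scalar equivalent is a plain bitfield extract (helper name illustrative):

#include <stdint.h>

/* Sketch: what ac_unpack_param(&ctx->ac, value, offset, width) computes,
 * e.g. gs_vtx_offset[i] = unpack(gs_vtx_offset[i & ~1], (i & 1) * 16, 16)
 * and gs_wave_id = unpack(merged_wave_info, 16, 8) above. */
static uint32_t unpack_param(uint32_t value, uint32_t offset, uint32_t width)
{
        return (value >> offset) & ((1u << width) - 1u);
}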
@@ -3138,11 +3662,10 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
unsigned i;
ctx.options = options;
ctx.shader_info = shader_info;
- ctx.context = LLVMContextCreate();
- ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
- options->family);
- ctx.ac.module = ac_create_module(tm, ctx.context);
+ ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
+ ctx.context = ctx.ac.context;
+ ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);
enum ac_float_mode float_mode =
options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
@@ -3152,6 +3675,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
memset(shader_info, 0, sizeof(*shader_info));
+ radv_nir_shader_info_init(&shader_info->info);
+
for(int i = 0; i < shader_count; ++i)
radv_nir_shader_info_pass(shaders[i], options, &shader_info->info);
@@ -3163,7 +3688,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
ctx.max_workgroup_size = 0;
for (int i = 0; i < shader_count; ++i) {
ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
- ac_nir_get_max_workgroup_size(ctx.options->chip_class,
+ radv_nir_get_max_workgroup_size(ctx.options->chip_class,
shaders[i]));
}
@@ -3178,23 +3703,31 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
ctx.abi.load_sampler_desc = radv_get_sampler_desc;
ctx.abi.load_resource = radv_load_resource;
ctx.abi.clamp_shadow_reference = false;
- ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9;
+ ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x800;
+
+ /* Because the new raw/struct atomic intrinsics are buggy with LLVM 8,
+ * we fall back to the old intrinsics for atomic buffer image operations
+ * and thus we need to apply the indexing workaround...
+ */
+ ctx.abi.gfx9_stride_size_workaround_for_atomic = ctx.ac.chip_class == GFX9 && HAVE_LLVM < 0x900;
if (shader_count >= 2)
ac_init_exec_full_mask(&ctx.ac);
- if (ctx.ac.chip_class == GFX9 &&
+ if ((ctx.ac.family == CHIP_VEGA10 ||
+ ctx.ac.family == CHIP_RAVEN) &&
shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
ac_nir_fixup_ls_hs_input_vgprs(&ctx);
for(int i = 0; i < shader_count; ++i) {
ctx.stage = shaders[i]->info.stage;
ctx.output_mask = 0;
- ctx.num_output_clips = shaders[i]->info.clip_distance_array_size;
- ctx.num_output_culls = shaders[i]->info.cull_distance_array_size;
if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
- ctx.gs_next_vertex = ac_build_alloca(&ctx.ac, ctx.ac.i32, "gs_next_vertex");
+ for (int i = 0; i < 4; i++) {
+ ctx.gs_next_vertex[i] =
+ ac_build_alloca(&ctx.ac, ctx.ac.i32, "");
+ }
ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out;
ctx.abi.load_inputs = load_gs_input;
ctx.abi.emit_primitive = visit_end_primitive;
@@ -3258,9 +3791,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
- LLVMValueRef count = ac_build_bfe(&ctx.ac, ctx.merged_wave_info,
- LLVMConstInt(ctx.ac.i32, 8 * i, false),
- LLVMConstInt(ctx.ac.i32, 8, false), false);
+ LLVMValueRef count = ac_unpack_param(&ctx.ac, ctx.merged_wave_info, 8 * i, 8);
LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT,
thread_id, count, "");
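[Editor's note] The hunk above gates each stage of a merged shader on a per-stage thread count. Collecting the merged_wave_info fields that this patch unpacks in various places (bit ranges per the ac_unpack_param calls; field and function names illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Sketch: merged_wave_info fields used in this patch:
 *   bits [8*i, 8*i+8) = thread count of merged stage i
 *   bits [16, 24)     = gs_wave_id
 *   bits [24, 28)     = wave index within the threadgroup
 * A lane runs stage i only while its thread id is below that stage's
 * count, matching the LLVMIntULT compare built above. */
static bool lane_runs_stage(uint32_t merged_wave_info, unsigned stage_i,
                            uint32_t thread_id)
{
        uint32_t count = (merged_wave_info >> (8 * stage_i)) & 0xffu;
        return thread_id < count;
}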
@@ -3297,7 +3828,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
if (options->dump_preoptir)
ac_dump_module(ctx.ac.module);
- ac_llvm_finalize_module(&ctx, passmgr, options);
+ ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);
if (shader_count == 1)
ac_nir_eliminate_const_vs_outputs(&ctx);
@@ -3327,15 +3858,10 @@ static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
static unsigned ac_llvm_compile(LLVMModuleRef M,
struct ac_shader_binary *binary,
- LLVMTargetMachineRef tm)
+ struct ac_llvm_compiler *ac_llvm)
{
unsigned retval = 0;
- char *err;
LLVMContextRef llvm_ctx;
- LLVMMemoryBufferRef out_buffer;
- unsigned buffer_size;
- const char *buffer_data;
- LLVMBool mem_err;
/* Setup Diagnostic Handler*/
llvm_ctx = LLVMGetModuleContext(M);
@@ -3344,31 +3870,12 @@ static unsigned ac_llvm_compile(LLVMModuleRef M,
&retval);
/* Compile IR*/
- mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile,
- &err, &out_buffer);
-
- /* Process Errors/Warnings */
- if (mem_err) {
- fprintf(stderr, "%s: %s", __FUNCTION__, err);
- free(err);
+ if (!radv_compile_to_binary(ac_llvm, M, binary))
retval = 1;
- goto out;
- }
-
- /* Extract Shader Code*/
- buffer_size = LLVMGetBufferSize(out_buffer);
- buffer_data = LLVMGetBufferStart(out_buffer);
-
- ac_elf_read(buffer_data, buffer_size, binary);
-
- /* Clean up */
- LLVMDisposeMemoryBuffer(out_buffer);
-
-out:
return retval;
}
-static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
+static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
LLVMModuleRef llvm_module,
struct ac_shader_binary *binary,
struct ac_shader_config *config,
@@ -3387,7 +3894,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
LLVMDisposeMessage(llvm_ir);
}
- int v = ac_llvm_compile(llvm_module, binary, tm);
+ int v = ac_llvm_compile(llvm_module, binary, ac_llvm);
if (v) {
fprintf(stderr, "compile failed\n");
}
@@ -3452,7 +3959,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
* - Floating-point output modifiers would be ignored by the hw.
* - Some opcodes don't support denormals, such as v_mad_f32. We would
* have to stop using those.
- * - SI & CI would be very slow.
+ * - GFX6 & GFX7 would be very slow.
*/
config->float_mode |= V_00B028_FP_64_DENORMS;
}
@@ -3497,8 +4004,7 @@ ac_fill_shader_info(struct radv_shader_variant_info *shader_info, struct nir_sha
}
void
-radv_compile_nir_shader(LLVMTargetMachineRef tm,
- LLVMPassManagerRef passmgr,
+radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
struct ac_shader_binary *binary,
struct ac_shader_config *config,
struct radv_shader_variant_info *shader_info,
@@ -3509,10 +4015,10 @@ radv_compile_nir_shader(LLVMTargetMachineRef tm,
LLVMModuleRef llvm_module;
- llvm_module = ac_translate_nir_to_llvm(tm, passmgr, nir, nir_count, shader_info,
+ llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info,
options);
- ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info,
+ ac_compile_llvm_module(ac_llvm, llvm_module, binary, config, shader_info,
nir[0]->info.stage, options);
for (int i = 0; i < nir_count; ++i)
@@ -3533,45 +4039,96 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx)
LLVMValueRef vtx_offset =
LLVMBuildMul(ctx->ac.builder, ctx->abi.vertex_id,
LLVMConstInt(ctx->ac.i32, 4, false), "");
- int idx = 0;
+ LLVMValueRef stream_id;
- for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
- int length = 4;
- int slot = idx;
- int slot_inc = 1;
- if (!(ctx->output_mask & (1ull << i)))
+ /* Fetch the vertex stream ID. */
+ if (ctx->shader_info->info.so.num_outputs) {
+ stream_id =
+ ac_unpack_param(&ctx->ac, ctx->streamout_config, 24, 2);
+ } else {
+ stream_id = ctx->ac.i32_0;
+ }
+
+ LLVMBasicBlockRef end_bb;
+ LLVMValueRef switch_inst;
+
+ end_bb = LLVMAppendBasicBlockInContext(ctx->ac.context,
+ ctx->main_function, "end");
+ switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4);
+
+ for (unsigned stream = 0; stream < 4; stream++) {
+ unsigned num_components =
+ ctx->shader_info->info.gs.num_stream_output_components[stream];
+ LLVMBasicBlockRef bb;
+ unsigned offset;
+
+ if (!num_components)
continue;
- if (i == VARYING_SLOT_CLIP_DIST0) {
- /* unpack clip and cull from a single set of slots */
- length = ctx->num_output_clips + ctx->num_output_culls;
- if (length > 4)
- slot_inc = 2;
- }
+ if (stream > 0 && !ctx->shader_info->info.so.num_outputs)
+ continue;
- for (unsigned j = 0; j < length; j++) {
- LLVMValueRef value, soffset;
+ bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out");
+ LLVMAddCase(switch_inst, LLVMConstInt(ctx->ac.i32, stream, 0), bb);
+ LLVMPositionBuilderAtEnd(ctx->ac.builder, bb);
- soffset = LLVMConstInt(ctx->ac.i32,
- (slot * 4 + j) *
- ctx->gs_max_out_vertices * 16 * 4, false);
+ offset = 0;
+ for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+ unsigned output_usage_mask =
+ ctx->shader_info->info.gs.output_usage_mask[i];
+ unsigned output_stream =
+ ctx->shader_info->info.gs.output_streams[i];
+ int length = util_last_bit(output_usage_mask);
+
+ if (!(ctx->output_mask & (1ull << i)) ||
+ output_stream != stream)
+ continue;
+
+ for (unsigned j = 0; j < length; j++) {
+ LLVMValueRef value, soffset;
+
+ if (!(output_usage_mask & (1 << j)))
+ continue;
- value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring,
- 1, ctx->ac.i32_0,
- vtx_offset, soffset,
- 0, 1, 1, true, false);
+ soffset = LLVMConstInt(ctx->ac.i32,
+ offset *
+ ctx->gs_max_out_vertices * 16 * 4, false);
+
+ offset++;
+
+ value = ac_build_buffer_load(&ctx->ac,
+ ctx->gsvs_ring[0],
+ 1, ctx->ac.i32_0,
+ vtx_offset, soffset,
+ 0, 1, 1, true, false);
+
+ LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
+ if (ac_get_type_size(type) == 2) {
+ value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
+ value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
+ }
- LLVMBuildStore(ctx->ac.builder,
- ac_to_float(&ctx->ac, value), ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
+ LLVMBuildStore(ctx->ac.builder,
+ ac_to_float(&ctx->ac, value), ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
+ }
+ }
+
+ if (ctx->shader_info->info.so.num_outputs)
+ radv_emit_streamout(ctx, stream);
+
+ if (stream == 0) {
+ handle_vs_outputs_post(ctx, false, false,
+ &ctx->shader_info->vs.outinfo);
}
- idx += slot_inc;
+
+ LLVMBuildBr(ctx->ac.builder, end_bb);
}
- handle_vs_outputs_post(ctx, false, false, &ctx->shader_info->vs.outinfo);
+
+ LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb);
}
void
-radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
- LLVMPassManagerRef passmgr,
+radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
struct nir_shader *geom_shader,
struct ac_shader_binary *binary,
struct ac_shader_config *config,
@@ -3579,13 +4136,12 @@ radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
{
struct radv_shader_context ctx = {0};
- ctx.context = LLVMContextCreate();
ctx.options = options;
ctx.shader_info = shader_info;
- ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
- options->family);
- ctx.ac.module = ac_create_module(tm, ctx.context);
+ ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
+ ctx.context = ctx.ac.context;
+ ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);
ctx.is_gs_copy_shader = true;
@@ -3603,9 +4159,6 @@ radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
ac_setup_rings(&ctx);
- ctx.num_output_clips = geom_shader->info.clip_distance_array_size;
- ctx.num_output_culls = geom_shader->info.cull_distance_array_size;
-
nir_foreach_variable(variable, &geom_shader->outputs) {
scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
ac_handle_shader_output_decl(&ctx.ac, &ctx.abi, geom_shader,
@@ -3616,8 +4169,8 @@ radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
LLVMBuildRetVoid(ctx.ac.builder);
- ac_llvm_finalize_module(&ctx, passmgr, options);
+ ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);
- ac_compile_llvm_module(tm, ctx.ac.module, binary, config, shader_info,
+ ac_compile_llvm_module(ac_llvm, ctx.ac.module, binary, config, shader_info,
MESA_SHADER_VERTEX, options);
}
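[Editor's note] In the new ac_gs_copy_shader_emit() above, 'offset' counts enabled components in emission order, so each component occupies a contiguous gs_max_out_vertices * 16 * 4 byte region of the stream-0 ring, and the per-vertex index selects a dword within it. A scalar sketch of the address fed to ac_build_buffer_load(); this is an interpretation of the constants in the code, and the name is illustrative:

#include <stdint.h>

/* Sketch: byte offset the GS copy shader reads from gsvs_ring[0]:
 * soffset = slot * gs_max_out_vertices * 16 * 4 per enabled component,
 * voffset = vertex_id * 4; the buffer load adds the two. */
static uint32_t gsvs_copy_read_offset(uint32_t vertex_id, uint32_t slot,
                                      uint32_t gs_max_out_vertices)
{
        uint32_t soffset = slot * gs_max_out_vertices * 16u * 4u;
        uint32_t voffset = vertex_id * 4u;
        return soffset + voffset;
}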