X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fvulkan%2Fradv_nir_to_llvm.c;h=900246d275ef57a13cf3240a1ae7360cfa45b573;hb=9f005f1f850710ea456f9847b1d247aaa8f0d6d2;hp=11f983974d67507cd3d180c312a1a09c54a60f6c;hpb=66c703b3e8a6a7e3c03e577c8deb377536ce5af2;p=mesa.git diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 11f983974d6..900246d275e 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -29,13 +29,9 @@ #include "radv_shader.h" #include "radv_shader_helper.h" #include "radv_shader_args.h" +#include "radv_debug.h" #include "nir/nir.h" -#include -#include -#include -#include - #include "sid.h" #include "ac_binary.h" #include "ac_llvm_util.h" @@ -117,87 +113,6 @@ static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx) } } -static unsigned -get_tcs_num_patches(struct radv_shader_context *ctx) -{ - unsigned num_tcs_input_cp = ctx->args->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; - uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; - uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * input_vertex_size; - uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; - uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - unsigned num_patches; - unsigned hardware_lds_size; - - /* Ensure that we only need one wave per SIMD so we don't need to check - * resource usage. Also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256. - */ - num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4; - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - */ - hardware_lds_size = 32768; - - /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single - * threadgroup, even though there is more than 32 KiB LDS. - * - * Test: dEQP-VK.tessellation.shader_input_output.barrier - */ - if (ctx->args->options->chip_class >= GFX7 && ctx->args->options->family != CHIP_STONEY) - hardware_lds_size = 65536; - - num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); - /* Make sure the output data fits in the offchip buffer */ - num_patches = MIN2(num_patches, (ctx->args->options->tess_offchip_block_dw_size * 4) / output_patch_size); - /* Not necessary for correctness, but improves performance. The - * specific value is taken from the proprietary driver. - */ - num_patches = MIN2(num_patches, 40); - - /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */ - if (ctx->args->options->chip_class == GFX6) { - unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); - num_patches = MIN2(num_patches, one_wave); - } - return num_patches; -} - -static unsigned -calculate_tess_lds_size(struct radv_shader_context *ctx) -{ - unsigned num_tcs_input_cp = ctx->args->options->key.tcs.input_vertices; - unsigned num_tcs_output_cp; - unsigned num_tcs_outputs, num_tcs_patch_outputs; - unsigned input_vertex_size, output_vertex_size; - unsigned input_patch_size, output_patch_size; - unsigned pervertex_output_patch_size; - unsigned output_patch0_offset; - unsigned num_patches; - unsigned lds_size; - - num_tcs_output_cp = ctx->shader->info.tess.tcs_vertices_out; - num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); - - input_vertex_size = ctx->tcs_num_inputs * 16; - output_vertex_size = num_tcs_outputs * 16; - - input_patch_size = num_tcs_input_cp * input_vertex_size; - - pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - - num_patches = ctx->tcs_num_patches; - output_patch0_offset = input_patch_size * num_patches; - - lds_size = output_patch0_offset + output_patch_size * num_patches; - return lds_size; -} - /* Tessellation shaders pass outputs to the next shader using LDS. * * LS outputs = TCS inputs @@ -411,15 +326,11 @@ static void create_function(struct radv_shader_context *ctx, ctx->max_workgroup_size, ctx->args->options); - if (ctx->args->options->supports_spill) { - ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", - LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), - NULL, 0, AC_FUNC_ATTR_READNONE); - ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, - ac_array_in_const_addr_space(ctx->ac.v4i32), ""); - } else if (ctx->args->ring_offsets.used) { - ctx->ring_offsets = ac_get_arg(&ctx->ac, ctx->args->ring_offsets); - } + ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", + LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), + NULL, 0, AC_FUNC_ATTR_READNONE); + ctx->ring_offsets = LLVMBuildBitCast(ctx->ac.builder, ctx->ring_offsets, + ac_array_in_const_addr_space(ctx->ac.v4i32), ""); load_descriptor_sets(ctx); @@ -472,7 +383,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, if (ctx->ac.chip_class >= GFX10) { desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else { desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -746,13 +657,13 @@ store_tcs_output(struct ac_shader_abi *abi, if (!is_tess_factor && writemask != 0xF) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1, buf_addr, oc_lds, - 4 * (base + chan), ac_glc, false); + 4 * (base + chan), ac_glc); } if (writemask == 0xF) { ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4, buf_addr, oc_lds, - (base * 4), ac_glc, false); + (base * 4), ac_glc); } } @@ -884,13 +795,6 @@ load_gs_input(struct ac_shader_abi *abi, return result; } - -static void radv_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - ac_build_kill_if_false(&ctx->ac, visible); -} - static uint32_t radv_get_sample_pos_offset(uint32_t num_samples) { @@ -972,39 +876,21 @@ static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi) static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs); static void -visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs) +visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs) { - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); if (ctx->args->options->key.vs_common_out.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs); return; } - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, don't emit any more: excessive vertex emissions are not - * supposed to have any effect. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - - bool use_kill = !ctx->args->shader_info->gs.writes_memory; - if (use_kill) - ac_build_kill_if_false(&ctx->ac, can_emit); - else - ac_build_ifcc(&ctx->ac, can_emit, 6505); - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; @@ -1029,7 +915,7 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr offset++; - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, vertexidx, ""); voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), ""); out_val = ac_to_integer(&ctx->ac, out_val); @@ -1041,20 +927,13 @@ visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addr voffset, ac_get_arg(&ctx->ac, ctx->args->gs2vs_offset), - 0, ac_glc | ac_slc, true); + 0, ac_glc | ac_slc | ac_swizzled); } } - gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, - ctx->ac.i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id); - - if (!use_kill) - ac_build_endif(&ctx->ac, 6505); } static void @@ -1284,35 +1163,6 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, ""); } -static unsigned -get_num_channels_from_data_format(unsigned data_format) -{ - switch (data_format) { - case V_008F0C_BUF_DATA_FORMAT_8: - case V_008F0C_BUF_DATA_FORMAT_16: - case V_008F0C_BUF_DATA_FORMAT_32: - return 1; - case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32: - return 2; - case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - case V_008F0C_BUF_DATA_FORMAT_32_32_32: - return 3; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - return 4; - default: - break; - } - - return 4; -} - static LLVMValueRef radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef value, @@ -1334,10 +1184,8 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, for (unsigned i = 0; i < num_channels; i++) chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i); } else { - if (num_channels) { - assert(num_channels == 1); - chan[0] = value; - } + assert(num_channels == 1); + chan[0] = value; } for (unsigned i = num_channels; i < 4; i++) { @@ -1399,11 +1247,12 @@ handle_vs_input_decl(struct radv_shader_context *ctx, ctx->args->ac.base_vertex), ""); } + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); + /* Adjust the number of channels to load based on the vertex * attribute format. */ - unsigned num_format_channels = get_num_channels_from_data_format(data_format); - unsigned num_channels = MIN2(num_input_channels, num_format_channels); + unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index]; unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index]; unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index]; @@ -1415,27 +1264,70 @@ handle_vs_input_decl(struct radv_shader_context *ctx, num_channels = MAX2(num_channels, 3); } - if (attrib_stride != 0 && attrib_offset > attrib_stride) { - LLVMValueRef buffer_offset = - LLVMConstInt(ctx->ac.i32, - attrib_offset / attrib_stride, false); + t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - buffer_index = LLVMBuildAdd(ctx->ac.builder, - buffer_index, - buffer_offset, ""); + /* Perform per-channel vertex fetch operations if unaligned + * access are detected. Only GFX6 and GFX10 are affected. + */ + bool unaligned_vertex_fetches = false; + if ((ctx->ac.chip_class == GFX6 || ctx->ac.chip_class == GFX10) && + vtx_info->chan_format != data_format && + ((attrib_offset % vtx_info->element_size) || + (attrib_stride % vtx_info->element_size))) + unaligned_vertex_fetches = true; + + if (unaligned_vertex_fetches) { + unsigned chan_format = vtx_info->chan_format; + LLVMValueRef values[4]; - attrib_offset = attrib_offset % attrib_stride; - } + assert(ctx->ac.chip_class == GFX6 || + ctx->ac.chip_class == GFX10); - t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false); - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + for (unsigned chan = 0; chan < num_channels; chan++) { + unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; + LLVMValueRef chan_index = buffer_index; - input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, - buffer_index, - LLVMConstInt(ctx->ac.i32, attrib_offset, false), - ctx->ac.i32_0, ctx->ac.i32_0, - num_channels, - data_format, num_format, 0, true); + if (attrib_stride != 0 && chan_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + chan_offset / attrib_stride, false); + + chan_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); + + chan_offset = chan_offset % attrib_stride; + } + + values[chan] = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + chan_index, + LLVMConstInt(ctx->ac.i32, chan_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, 1, + chan_format, num_format, 0, true); + } + + input = ac_build_gather_values(&ctx->ac, values, num_channels); + } else { + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, + attrib_offset / attrib_stride, false); + + buffer_index = LLVMBuildAdd(ctx->ac.builder, + buffer_index, + buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; + } + + input = ac_build_struct_tbuffer_load(&ctx->ac, t_list, + buffer_index, + LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, + num_channels, + data_format, num_format, 0, true); + } if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { LLVMValueRef c[4]; @@ -1772,7 +1664,7 @@ radv_emit_stream_output(struct radv_shader_context *ctx, ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf], vdata, num_comps, so_write_offsets[buf], ctx->ac.i32_0, offset, - ac_glc | ac_slc, false); + ac_glc | ac_slc); } static void @@ -2177,7 +2069,7 @@ handle_es_outputs_post(struct radv_shader_context *ctx, NULL, ac_get_arg(&ctx->ac, ctx->args->es2gs_offset), (4 * param_index + j) * 4, - ac_glc | ac_slc, true); + ac_glc | ac_slc | ac_swizzled); } } } @@ -2252,7 +2144,7 @@ static LLVMValueRef ngg_get_ordered_id(struct radv_shader_context *ctx) { return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), ctx->ac.i32_0, - LLVMConstInt(ctx->ac.i32, 11, false), + LLVMConstInt(ctx->ac.i32, 12, false), false); } @@ -2338,101 +2230,28 @@ ngg_gs_emit_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef gsthread, return ngg_gs_vertex_ptr(ctx, vertexidx); } -/* Send GS Alloc Req message from the first wave of the group to SPI. - * Message payload is: - * - bits 0..10: vertices in group - * - bits 12..22: primitives in group - */ -static void build_sendmsg_gs_alloc_req(struct radv_shader_context *ctx, - LLVMValueRef vtx_cnt, - LLVMValueRef prim_cnt) +static LLVMValueRef +ngg_gs_get_emit_output_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, + unsigned out_idx) { - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef tmp; - - tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, tmp, 5020); - - tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),""); - tmp = LLVMBuildOr(builder, tmp, vtx_cnt, ""); - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp); - - ac_build_endif(&ctx->ac, 5020); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_0, /* first struct entry */ + LLVMConstInt(ctx->ac.i32, out_idx, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -struct ngg_prim { - unsigned num_vertices; - LLVMValueRef isnull; - LLVMValueRef swap; - LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; -}; - -static void build_export_prim(struct radv_shader_context *ctx, - const struct ngg_prim *prim) +static LLVMValueRef +ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef vertexptr, + unsigned stream) { - LLVMBuilderRef builder = ctx->ac.builder; - struct ac_export_args args; - LLVMValueRef vertices[3]; - LLVMValueRef odd, even; - LLVMValueRef tmp; - - tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, ""); - args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), ""); - - for (unsigned i = 0; i < prim->num_vertices; ++i) { - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, ""); - tmp = LLVMBuildShl(builder, tmp, - LLVMConstInt(ctx->ac.i32, 9, false), ""); - vertices[i] = LLVMBuildOr(builder, prim->index[i], tmp, ""); - } - - switch (prim->num_vertices) { - case 1: - args.out[0] = LLVMBuildOr(builder, args.out[0], vertices[0], ""); - break; - case 2: - tmp = LLVMBuildShl(builder, vertices[1], - LLVMConstInt(ctx->ac.i32, 10, false), ""); - tmp = LLVMBuildOr(builder, args.out[0], tmp, ""); - args.out[0] = LLVMBuildOr(builder, tmp, vertices[0], ""); - break; - case 3: - /* Swap vertices if needed to follow drawing order. */ - tmp = LLVMBuildShl(builder, vertices[2], - LLVMConstInt(ctx->ac.i32, 20, false), ""); - even = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildShl(builder, vertices[1], - LLVMConstInt(ctx->ac.i32, 10, false), ""); - even = LLVMBuildOr(builder, even, tmp, ""); - even = LLVMBuildOr(builder, even, vertices[0], ""); - - tmp = LLVMBuildShl(builder, vertices[1], - LLVMConstInt(ctx->ac.i32, 20, false), ""); - odd = LLVMBuildOr(builder, args.out[0], tmp, ""); - tmp = LLVMBuildShl(builder, vertices[2], - LLVMConstInt(ctx->ac.i32, 10, false), ""); - odd = LLVMBuildOr(builder, odd, tmp, ""); - odd = LLVMBuildOr(builder, odd, vertices[0], ""); - - args.out[0] = LLVMBuildSelect(builder, prim->swap, odd, even, ""); - break; - default: - unreachable("invalid number of vertices"); - } - - args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, ""); - args.out[1] = LLVMGetUndef(ctx->ac.f32); - args.out[2] = LLVMGetUndef(ctx->ac.f32); - args.out[3] = LLVMGetUndef(ctx->ac.f32); - - args.target = V_008DFC_SQ_EXP_PRIM; - args.enabled_channels = 1; - args.done = true; - args.valid_mask = false; - args.compr = false; - - ac_build_export(&ctx->ac, &args); + LLVMValueRef gep_idx[3] = { + ctx->ac.i32_0, /* implied C-style array */ + ctx->ac.i32_1, /* second struct entry */ + LLVMConstInt(ctx->ac.i32, stream, false), + }; + return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } static struct radv_stream_output * @@ -3021,17 +2840,11 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) /* TODO: primitive culling */ - build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx)); /* TODO: streamout queries */ - /* Export primitive data to the index buffer. Format is: - * - bits 0..8: index 0 - * - bit 9: edge flag 0 - * - bits 10..18: index 1 - * - bit 19: edge flag 1 - * - bits 20..28: index 2 - * - bit 29: edge flag 2 - * - bit 31: null primitive (skip) + /* Export primitive data to the index buffer. * * For the first version, we will always build up all three indices * independent of the primitive type. The additional garbage data @@ -3042,21 +2855,24 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) */ ac_build_ifcc(&ctx->ac, is_gs_thread, 6001); { - struct ngg_prim prim = {}; - - prim.num_vertices = num_vertices; - prim.isnull = ctx->ac.i1false; - prim.swap = ctx->ac.i1false; - memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + struct ac_ngg_prim prim = {}; - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, - ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) { + prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]); + } else { + prim.num_vertices = num_vertices; + prim.isnull = ctx->ac.i1false; + memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); + + for (unsigned i = 0; i < num_vertices; ++i) { + tmp = LLVMBuildLShr(builder, + ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), + LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); + prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + } } - build_export_prim(ctx, &prim); + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 6001); @@ -3167,13 +2983,8 @@ static void gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second entry of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - LLVMBuildStore(builder, i8_0, tmp); + LLVMBuildStore(builder, i8_0, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream)); ac_build_endloop(&ctx->ac, 5100); } @@ -3225,13 +3036,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) continue; - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); @@ -3247,6 +3053,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) build_streamout(ctx, &nggso); } + /* Write shader query data. */ + tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5109); + tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, + LLVMConstInt(ctx->ac.i32, 4, false), ""); + ac_build_ifcc(&ctx->ac, tmp, 5110); + { + tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), ""); + + ac_llvm_add_target_dep_function_attr(ctx->main_function, + "amdgpu-gds-size", 256); + + LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); + LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); + + const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup"; + + /* Use a plain GDS atomic to accumulate the number of generated + * primitives. + */ + ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase, + tmp, sync_scope); + } + ac_build_endif(&ctx->ac, 5110); + ac_build_endif(&ctx->ac, 5109); + /* TODO: culling */ /* Determine vertex liveness. */ @@ -3267,13 +3100,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) /* Load primitive liveness */ tmp = ngg_gs_vertex_ptr(ctx, primidx); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* stream 0 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); @@ -3319,7 +3147,8 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) * there are 4 or more contiguous null primitives in the export * (in the common case of single-dword prim exports). */ - build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads); + ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), + vertlive_scan.result_reduce, num_emit_threads); /* Setup the reverse vertex compaction permutation. We re-use stream 1 * of the primitive liveness flags, relying on the fact that each @@ -3327,14 +3156,9 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) ac_build_ifcc(&ctx->ac, vertlive, 5130); { tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1 */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp2, tmp); + LLVMBuildStore(builder, tmp2, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1)); } ac_build_endif(&ctx->ac, 5130); @@ -3344,22 +3168,14 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); ac_build_ifcc(&ctx->ac, tmp, 5140); { - struct ngg_prim prim = {}; + LLVMValueRef flags; + struct ac_ngg_prim prim = {}; prim.num_vertices = verts_per_prim; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_0, /* primflag */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); - prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp, - LLVMConstInt(ctx->ac.i8, 0, false), ""); - prim.swap = LLVMBuildICmp(builder, LLVMIntEQ, - LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->ac.i32, 1, false), ""), - LLVMConstInt(ctx->ac.i32, 1, false), ""); + flags = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); + prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, @@ -3367,7 +3183,25 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) prim.edgeflag[i] = ctx->ac.i1false; } - build_export_prim(ctx, &prim); + /* Geometry shaders output triangle strips, but NGG expects + * triangles. We need to change the vertex order for odd + * triangles to get correct front/back facing by swapping 2 + * vertex indices, but we also have to keep the provoking + * vertex in the same place. + */ + if (verts_per_prim == 3) { + LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, ""); + is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); + + struct ac_ngg_prim in = prim; + prim.index[0] = in.index[0]; + prim.index[1] = LLVMBuildSelect(builder, is_odd, + in.index[2], in.index[1], ""); + prim.index[2] = LLVMBuildSelect(builder, is_odd, + in.index[1], in.index[2], ""); + } + + ac_build_export_prim(&ctx->ac, &prim); } ac_build_endif(&ctx->ac, 5140); @@ -3389,18 +3223,12 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) outinfo->pos_exports = 0; tmp = ngg_gs_vertex_ptr(ctx, tid); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implicit C-style array */ - ctx->ac.i32_1, /* second value of struct */ - ctx->ac.i32_1, /* stream 1: source data index */ - }; - tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, ""); - tmp = LLVMBuildLoad(builder, tmp, ""); + tmp = LLVMBuildLoad(builder, + ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp); unsigned out_idx = 0; - gep_idx[1] = ctx->ac.i32_0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; @@ -3417,8 +3245,7 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) if (!(output_usage_mask & (1 << j))) continue; - gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false); - tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); + tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx); tmp = LLVMBuildLoad(builder, tmp, ""); LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]); @@ -3457,25 +3284,11 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, + LLVMValueRef vertexidx, LLVMValueRef *addrs) { LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef tmp; - const LLVMValueRef vertexidx = - LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - */ - const LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, vertexidx, - LLVMConstInt(ctx->ac.i32, ctx->shader->info.gs.vertices_out, false), ""); - ac_build_ifcc(&ctx->ac, can_emit, 9001); - - tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); - tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, ""); - LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); @@ -3498,21 +3311,22 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_0, /* first entry of struct */ - LLVMConstInt(ctx->ac.i32, out_idx, false), - }; - LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - out_val = ac_to_integer(&ctx->ac, out_val); out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, ""); - LLVMBuildStore(builder, out_val, ptr); + LLVMBuildStore(builder, out_val, + ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size); + /* Store the current number of emitted vertices to zero out remaining + * primitive flags in case the geometry shader doesn't emit the maximum + * number of vertices. + */ + tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, ""); + LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]); + /* Determine and store whether this vertex completed a primitive. */ const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], ""); @@ -3520,25 +3334,35 @@ static void gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, ""); + /* Since the geometry shader emits triangle strips, we need to + * track which primitive is odd and swap vertex indices to get + * the correct vertex order. + */ + LLVMValueRef is_odd = ctx->ac.i1false; + if (stream == 0 && + si_conv_gl_prim_to_vertices(ctx->shader->info.gs.output_primitive) == 3) { + tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, ""); + is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, ""); + } + tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, ""); LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]); - LLVMValueRef gep_idx[3] = { - ctx->ac.i32_0, /* implied C-style array */ - ctx->ac.i32_1, /* second struct entry */ - LLVMConstInt(ctx->ac.i32, stream, false), - }; - const LLVMValueRef primflagptr = - LLVMBuildGEP(builder, vertexptr, gep_idx, 3, ""); - + /* The per-vertex primitive flag encoding: + * bit 0: whether this vertex finishes a primitive + * bit 1: whether the primitive is odd (if we are emitting triangle strips) + */ tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, ""); - LLVMBuildStore(builder, tmp, primflagptr); + tmp = LLVMBuildOr(builder, tmp, + LLVMBuildShl(builder, + LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), + ctx->ac.i8_1, ""), ""); + LLVMBuildStore(builder, tmp, + ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream)); tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], ""); tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), ""); LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]); - - ac_build_endif(&ctx->ac, 9001); } static void @@ -3639,7 +3463,7 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, false), 1, ctx->ac.i32_0, tf_base, - 0, ac_glc, false); + 0, ac_glc); tf_offset += 4; ac_build_endif(&ctx->ac, 6504); @@ -3648,11 +3472,11 @@ write_tess_factors(struct radv_shader_context *ctx) /* Store the tessellation factors. */ ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, - tf_offset, ac_glc, false); + tf_offset, ac_glc); if (vec1) ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, - 16 + tf_offset, ac_glc, false); + 16 + tf_offset, ac_glc); //store to offchip for TES to read - only if TES reads them if (ctx->args->options->key.tcs.tes_reads_tess_factors) { @@ -3670,7 +3494,7 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec, outer_comps, tf_outer_offset, ac_get_arg(&ctx->ac, ctx->args->oc_lds), - 0, ac_glc, false); + 0, ac_glc); if (inner_comps) { param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER); tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL, @@ -3681,7 +3505,7 @@ write_tess_factors(struct radv_shader_context *ctx) ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec, inner_comps, tf_inner_offset, ac_get_arg(&ctx->ac, ctx->args->oc_lds), - 0, ac_glc, false); + 0, ac_glc); } } @@ -4067,7 +3891,8 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family, float_mode, - args->shader_info->wave_size, 64); + args->shader_info->wave_size, + args->shader_info->ballot_bit_size); ctx.context = ctx.ac.context; ctx.max_workgroup_size = 0; @@ -4089,7 +3914,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.abi.inputs = &ctx.inputs[0]; ctx.abi.emit_outputs = handle_shader_outputs_post; - ctx.abi.emit_vertex = visit_emit_vertex; + ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter; ctx.abi.load_ubo = radv_load_ubo; ctx.abi.load_ssbo = radv_load_ssbo; ctx.abi.load_sampler_desc = radv_get_sampler_desc; @@ -4120,16 +3945,19 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, * Add an extra dword per vertex to ensure an odd stride, which * avoids bank conflicts for SoA accesses. */ - declare_esgs_ring(&ctx); + if (!args->options->key.vs_common_out.as_ngg_passthrough) + declare_esgs_ring(&ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. */ - LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); - ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, - asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + if (args->shader_info->so.num_outputs) { + LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); + ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, + asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); + LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); + LLVMSetAlignment(ctx.gs_ngg_scratch, 4); + } } for(int i = 0; i < shader_count; ++i) { @@ -4177,7 +4005,16 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ctx.tcs_num_inputs = args->options->key.tcs.num_inputs; else ctx.tcs_num_inputs = util_last_bit64(args->shader_info->vs.ls_outputs_written); - ctx.tcs_num_patches = get_tcs_num_patches(&ctx); + ctx.tcs_num_patches = + get_tcs_num_patches( + ctx.args->options->key.tcs.input_vertices, + ctx.shader->info.tess.tcs_vertices_out, + ctx.tcs_num_inputs, + ctx.args->shader_info->tcs.outputs_written, + ctx.args->shader_info->tcs.patch_outputs_written, + ctx.args->options->tess_offchip_block_dw_size, + ctx.args->options->chip_class, + ctx.args->options->family); } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) { ctx.abi.load_tess_varyings = load_tes_input; ctx.abi.load_tess_coord = load_tess_coord; @@ -4188,7 +4025,6 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; - ctx.abi.emit_kill = radv_emit_kill; } if (shaders[i]->info.stage == MESA_SHADER_VERTEX && @@ -4235,7 +4071,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, ac_setup_rings(&ctx); - LLVMBasicBlockRef merge_block; + LLVMBasicBlockRef merge_block = NULL; if (shader_count >= 2 || is_ngg) { LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder)); LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, ""); @@ -4280,7 +4116,14 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) { args->shader_info->tcs.num_patches = ctx.tcs_num_patches; - args->shader_info->tcs.lds_size = calculate_tess_lds_size(&ctx); + args->shader_info->tcs.lds_size = + calculate_tess_lds_size( + ctx.args->options->key.tcs.input_vertices, + ctx.shader->info.tess.tcs_vertices_out, + ctx.tcs_num_inputs, + ctx.tcs_num_patches, + ctx.args->shader_info->tcs.outputs_written, + ctx.args->shader_info->tcs.patch_outputs_written); } } @@ -4391,7 +4234,7 @@ static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, free(elf_buffer); } -void +static void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, const struct radv_shader_args *args, @@ -4517,7 +4360,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMPositionBuilderAtEnd(ctx->ac.builder, end_bb); } -void +static void radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, @@ -4556,3 +4399,36 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, (*rbinary)->is_gs_copy_shader = true; } + +void +llvm_compile_shader(struct radv_device *device, + unsigned shader_count, + struct nir_shader *const *shaders, + struct radv_shader_binary **binary, + struct radv_shader_args *args) +{ + enum ac_target_machine_options tm_options = 0; + struct ac_llvm_compiler ac_llvm; + bool thread_compiler; + + tm_options |= AC_TM_SUPPORTS_SPILL; + if (args->options->check_ir) + tm_options |= AC_TM_CHECK_IR; + if (device->instance->debug_flags & RADV_DEBUG_NO_LOAD_STORE_OPT) + tm_options |= AC_TM_NO_LOAD_STORE_OPT; + + thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM); + + radv_init_llvm_compiler(&ac_llvm, thread_compiler, + args->options->family, tm_options, + args->shader_info->wave_size); + + if (args->is_gs_copy_shader) { + radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args); + } else { + radv_compile_nir_shader(&ac_llvm, binary, args, + shaders, shader_count); + } + + radv_destroy_llvm_compiler(&ac_llvm, thread_compiler); +}