From: Marek Olšák Date: Wed, 15 Jan 2020 01:03:48 +0000 (-0500) Subject: radeonsi: move geometry shader code into si_shader_llvm_gs.c X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=da2c12af4b9cfb93bd6880cfa4a218c44b79fe13 radeonsi: move geometry shader code into si_shader_llvm_gs.c Reviewed-by: Timothy Arceri Part-of: --- diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index eca13c29784..152e8c74689 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -37,6 +37,7 @@ C_SOURCES := \ si_shader_internal.h \ si_shader_llvm.c \ si_shader_llvm_build.c \ + si_shader_llvm_gs.c \ si_shader_llvm_ps.c \ si_shader_llvm_tess.c \ si_shader_nir.c \ diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index cca69c58b2c..9bf63f57b2c 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -52,6 +52,7 @@ files_libradeonsi = files( 'si_shader_internal.h', 'si_shader_llvm.c', 'si_shader_llvm_build.c', + 'si_shader_llvm_gs.c', 'si_shader_llvm_ps.c', 'si_shader_llvm_tess.c', 'si_shader_nir.c', diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index cd352ec4cb2..ab6751f44b4 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -49,8 +49,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f); static void si_build_vs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key); -static void si_fix_resource_usage(struct si_screen *sscreen, - struct si_shader *shader); /** Whether the shader runs as a combination of multiple API shaders */ static bool is_multi_part_shader(struct si_shader_context *ctx) @@ -428,122 +426,6 @@ LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, } } -static LLVMValueRef 
si_llvm_load_input_gs(struct ac_shader_abi *abi, - unsigned input_index, - unsigned vtx_offset_param, - LLVMTypeRef type, - unsigned swizzle) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *shader = ctx->shader; - LLVMValueRef vtx_offset, soffset; - struct si_shader_info *info = &shader->selector->info; - unsigned semantic_name = info->input_semantic_name[input_index]; - unsigned semantic_index = info->input_semantic_index[input_index]; - unsigned param; - LLVMValueRef value; - - param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); - - /* GFX9 has the ESGS ring in LDS. */ - if (ctx->screen->info.chip_class >= GFX9) { - unsigned index = vtx_offset_param; - - switch (index / 2) { - case 0: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, - index % 2 ? 16 : 0, 16); - break; - case 1: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, - index % 2 ? 16 : 0, 16); - break; - case 2: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, - index % 2 ? 16 : 0, 16); - break; - default: - assert(0); - return NULL; - } - - unsigned offset = param * 4 + swizzle; - vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, - LLVMConstInt(ctx->i32, offset, false), ""); - - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); - LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - if (ac_get_type_size(type) == 64) { - ptr = LLVMBuildGEP(ctx->ac.builder, ptr, - &ctx->ac.i32_1, 1, ""); - LLVMValueRef values[2] = { - value, - LLVMBuildLoad(ctx->ac.builder, ptr, "") - }; - value = ac_build_gather_values(&ctx->ac, values, 2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); - } - - /* GFX6: input load from the ESGS ring in memory. 
*/ - if (swizzle == ~0) { - LLVMValueRef values[4]; - unsigned chan; - for (chan = 0; chan < 4; chan++) { - values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, - type, chan); - } - return ac_build_gather_values(&ctx->ac, values, 4); - } - - /* Get the vertex offset parameter on GFX6. */ - LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, - ctx->gs_vtx_offset[vtx_offset_param]); - - vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, - LLVMConstInt(ctx->i32, 4, 0), ""); - - soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); - - value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, - vtx_offset, soffset, 0, ac_glc, true, false); - if (ac_get_type_size(type) == 64) { - LLVMValueRef value2; - soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); - - value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, - ctx->i32_0, vtx_offset, soffset, - 0, ac_glc, true, false); - return si_build_gather_64bit(ctx, type, value, value2); - } - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); -} - -static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, - unsigned location, - unsigned driver_location, - unsigned component, - unsigned num_components, - unsigned vertex_index, - unsigned const_index, - LLVMTypeRef type) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 64) - offset *= 2; - - offset += component; - value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, - vertex_index, type, offset); - } - - return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); -} - static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); @@ -816,9 +698,9 @@ void si_emit_streamout_output(struct si_shader_context *ctx, * Write 
streamout data to buffers for vertex stream @p stream (different * vertex streams can occur for GS copy shaders). */ -static void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream) +void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream) { struct si_shader_selector *sel = ctx->shader->selector; struct pipe_stream_output_info *so = &sel->so; @@ -1178,141 +1060,6 @@ void si_llvm_export_vs(struct si_shader_context *ctx, si_build_param_exports(ctx, outputs, noutput); } -/* Pass GS inputs from ES to GS on GFX9. */ -static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) -{ - LLVMValueRef ret = ctx->return_value; - - ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); - ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); - if (ctx->shader->key.as_ngg) - ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); - else - ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); - ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); - - ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, - 8 + SI_SGPR_RW_BUFFERS); - ret = si_insert_input_ptr(ctx, ret, - ctx->bindless_samplers_and_images, - 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - if (ctx->screen->use_ngg) { - ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, - 8 + SI_SGPR_VS_STATE_BITS); - } - - unsigned vgpr; - if (ctx->type == PIPE_SHADER_VERTEX) - vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; - - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, 
ctx->args.gs_invocation_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); - ctx->return_value = ret; -} - -static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader *es = ctx->shader; - struct si_shader_info *info = &es->selector->info; - LLVMValueRef lds_base = NULL; - unsigned chan; - int i; - - if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { - unsigned itemsize_dw = es->selector->esgs_itemsize / 4; - LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); - LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); - vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, - LLVMBuildMul(ctx->ac.builder, wave_idx, - LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), ""); - lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, - LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); - } - - for (i = 0; i < info->num_outputs; i++) { - int param; - - if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || - info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) - continue; - - param = si_shader_io_get_unique_index(info->output_semantic_name[i], - info->output_semantic_index[i], false); - - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - out_val = ac_to_integer(&ctx->ac, out_val); - - /* GFX9 has the ESGS ring in LDS. 
*/ - if (ctx->screen->info.chip_class >= GFX9) { - LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false); - idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); - ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); - continue; - } - - ac_build_buffer_store_dword(&ctx->ac, - ctx->esgs_ring, - out_val, 1, NULL, - ac_get_arg(&ctx->ac, ctx->es2gs_offset), - (4 * param + chan) * 4, - ac_glc | ac_slc | ac_swizzled); - } - } - - if (ctx->screen->info.chip_class >= GFX9) - si_set_es_return_value_for_gs(ctx); -} - -static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) -{ - if (ctx->screen->info.chip_class >= GFX9) - return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); - else - return ac_get_arg(&ctx->ac, ctx->gs_wave_id); -} - -static void emit_gs_epilogue(struct si_shader_context *ctx) -{ - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_epilogue(ctx); - return; - } - - if (ctx->screen->info.chip_class >= GFX10) - LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); - - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, - si_get_gs_wave_id(ctx)); - - if (ctx->screen->info.chip_class >= GFX9) - ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); -} - -static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info UNUSED *info = &ctx->shader->selector->info; - - assert(info->num_outputs <= max_outputs); - - emit_gs_epilogue(ctx); -} - static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) @@ -1389,106 +1136,6 @@ static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, ctx->return_value = ret; } -/* Emit one vertex from the geometry shader */ -static void si_llvm_emit_vertex(struct ac_shader_abi *abi, - unsigned stream, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx 
= si_shader_context_from_abi(abi); - - if (ctx->shader->key.as_ngg) { - gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); - return; - } - - struct si_shader_info *info = &ctx->shader->selector->info; - struct si_shader *shader = ctx->shader; - LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); - LLVMValueRef gs_next_vertex; - LLVMValueRef can_emit; - unsigned chan, offset; - int i; - - /* Write vertex attribute values to GSVS ring */ - gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, - ctx->gs_next_vertex[stream], - ""); - - /* If this thread has already emitted the declared maximum number of - * vertices, skip the write: excessive vertex emissions are not - * supposed to have any effect. - * - * If the shader has no writes to memory, kill it instead. This skips - * further memory loads and may allow LLVM to skip to the end - * altogether. - */ - can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, - LLVMConstInt(ctx->i32, - shader->selector->gs_max_out_vertices, 0), ""); - - bool use_kill = !info->writes_memory; - if (use_kill) { - ac_build_kill_if_false(&ctx->ac, can_emit); - } else { - ac_build_ifcc(&ctx->ac, can_emit, 6505); - } - - offset = 0; - for (i = 0; i < info->num_outputs; i++) { - for (chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan)) || - ((info->output_streams[i] >> (2 * chan)) & 3) != stream) - continue; - - LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - LLVMValueRef voffset = - LLVMConstInt(ctx->i32, offset * - shader->selector->gs_max_out_vertices, 0); - offset++; - - voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); - voffset = LLVMBuildMul(ctx->ac.builder, voffset, - LLVMConstInt(ctx->i32, 4, 0), ""); - - out_val = ac_to_integer(&ctx->ac, out_val); - - ac_build_buffer_store_dword(&ctx->ac, - ctx->gsvs_ring[stream], - out_val, 1, - voffset, soffset, 0, - ac_glc | ac_slc | ac_swizzled); - } - } - - gs_next_vertex = 
LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, ""); - LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); - - /* Signal vertex emission if vertex data was written. */ - if (offset) { - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); - } - - if (!use_kill) - ac_build_endif(&ctx->ac, 6505); -} - -/* Cut one primitive from the geometry shader */ -static void si_llvm_emit_primitive(struct ac_shader_abi *abi, - unsigned stream) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - if (ctx->shader->key.as_ngg) { - LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); - return; - } - - /* Signal primitive cut */ - ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), - si_get_gs_wave_id(ctx)); -} - static void declare_streamout_params(struct si_shader_context *ctx, struct pipe_stream_output_info *so) { @@ -1708,7 +1355,7 @@ void si_add_arg_checked(struct ac_shader_args *args, ac_add_arg(args, file, registers, type, arg); } -static void create_function(struct si_shader_context *ctx) +void si_create_function(struct si_shader_context *ctx) { struct si_shader *shader = ctx->shader; LLVMTypeRef returns[AC_MAX_ARGS]; @@ -2106,144 +1753,6 @@ static void create_function(struct si_shader_context *ctx) } } -/* Ensure that the esgs ring is declared. - * - * We declare it with 64KB alignment as a hint that the - * pointer value will always be 0. 
- */ -static void declare_esgs_ring(struct si_shader_context *ctx) -{ - if (ctx->esgs_ring) - return; - - assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); - - ctx->esgs_ring = LLVMAddGlobalInAddressSpace( - ctx->ac.module, LLVMArrayType(ctx->i32, 0), - "esgs_ring", - AC_ADDR_SPACE_LDS); - LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); - LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); -} - -/** - * Load ESGS and GSVS ring buffer resource descriptors and save the variables - * for later use. - */ -static void preload_ring_buffers(struct si_shader_context *ctx) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - - if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) { - if (ctx->screen->info.chip_class <= GFX8) { - unsigned ring = - ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS - : SI_ES_RING_ESGS; - LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); - - ctx->esgs_ring = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } else { - if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { - /* Declare the ESGS ring as an explicit LDS symbol. */ - declare_esgs_ring(ctx); - } else { - ac_declare_lds_as_pointer(&ctx->ac); - ctx->esgs_ring = ctx->ac.lds; - } - } - } - - if (ctx->shader->is_gs_copy_shader) { - LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); - - ctx->gsvs_ring[0] = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } else if (ctx->type == PIPE_SHADER_GEOMETRY) { - const struct si_shader_selector *sel = ctx->shader->selector; - LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); - LLVMValueRef base_ring; - - base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - /* The conceptual layout of the GSVS ring is - * v0c0 .. vLv0 v0c1 .. vLc1 .. - * but the real memory layout is swizzled across - * threads: - * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL - * t16v0c0 .. - * Override the buffer descriptor accordingly. 
- */ - LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); - uint64_t stream_offset = 0; - - for (unsigned stream = 0; stream < 4; ++stream) { - unsigned num_components; - unsigned stride; - unsigned num_records; - LLVMValueRef ring, tmp; - - num_components = sel->info.num_stream_output_components[stream]; - if (!num_components) - continue; - - stride = 4 * num_components * sel->gs_max_out_vertices; - - /* Limit on the stride field for <= GFX7. */ - assert(stride < (1 << 14)); - - num_records = ctx->ac.wave_size; - - ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); - tmp = LLVMBuildAdd(builder, tmp, - LLVMConstInt(ctx->i64, - stream_offset, 0), ""); - stream_offset += stride * ctx->ac.wave_size; - - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); - ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); - tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); - tmp = LLVMBuildOr(builder, tmp, - LLVMConstInt(ctx->i32, - S_008F04_STRIDE(stride) | - S_008F04_SWIZZLE_ENABLE(1), 0), ""); - ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); - ring = LLVMBuildInsertElement(builder, ring, - LLVMConstInt(ctx->i32, num_records, 0), - LLVMConstInt(ctx->i32, 2, 0), ""); - - uint32_t rsrc3 = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | - S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ - S_008F0C_ADD_TID_ENABLE(1); - - if (ctx->ac.chip_class >= GFX10) { - rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | - S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ - } - - ring = LLVMBuildInsertElement(builder, ring, - 
LLVMConstInt(ctx->i32, rsrc3, false), - LLVMConstInt(ctx->i32, 3, 0), ""); - - ctx->gsvs_ring[stream] = ring; - } - } else if (ctx->type == PIPE_SHADER_TESS_EVAL) { - si_llvm_preload_tes_rings(ctx); - } -} - /* For the UMR disassembler. */ #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ #define DEBUGGER_NUM_MARKERS 5 @@ -2656,16 +2165,16 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, si_shader_dump_stats(sscreen, shader, file, check_debug_option); } -static int si_compile_llvm(struct si_screen *sscreen, - struct si_shader_binary *binary, - struct ac_shader_config *conf, - struct ac_llvm_compiler *compiler, - LLVMModuleRef mod, - struct pipe_debug_callback *debug, - enum pipe_shader_type shader_type, - unsigned wave_size, - const char *name, - bool less_optimized) +int si_compile_llvm(struct si_screen *sscreen, + struct si_shader_binary *binary, + struct ac_shader_config *conf, + struct ac_llvm_compiler *compiler, + LLVMModuleRef mod, + struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, + unsigned wave_size, + const char *name, + bool less_optimized) { unsigned count = p_atomic_inc_return(&sscreen->num_compilations); @@ -2724,155 +2233,6 @@ static int si_compile_llvm(struct si_screen *sscreen, return 0; } -/* Generate code for the hardware VS shader stage to go with a geometry shader */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug) -{ - struct si_shader_context ctx; - struct si_shader *shader; - LLVMBuilderRef builder; - struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; - struct si_shader_info *gsinfo = &gs_selector->info; - int i; - - - shader = CALLOC_STRUCT(si_shader); - if (!shader) - return NULL; - - /* We can leave the fence as permanently signaled because the GS copy - * shader only becomes visible globally after it has been compiled. 
*/ - util_queue_fence_init(&shader->ready); - - shader->selector = gs_selector; - shader->is_gs_copy_shader = true; - - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false)); - ctx.shader = shader; - ctx.type = PIPE_SHADER_VERTEX; - - builder = ctx.ac.builder; - - create_function(&ctx); - preload_ring_buffers(&ctx); - - LLVMValueRef voffset = - LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, - LLVMConstInt(ctx.i32, 4, 0), ""); - - /* Fetch the vertex stream ID.*/ - LLVMValueRef stream_id; - - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) - stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); - else - stream_id = ctx.i32_0; - - /* Fill in output information. */ - for (i = 0; i < gsinfo->num_outputs; ++i) { - outputs[i].semantic_name = gsinfo->output_semantic_name[i]; - outputs[i].semantic_index = gsinfo->output_semantic_index[i]; - - for (int chan = 0; chan < 4; chan++) { - outputs[i].vertex_stream[chan] = - (gsinfo->output_streams[i] >> (2 * chan)) & 3; - } - } - - LLVMBasicBlockRef end_bb; - LLVMValueRef switch_inst; - - end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); - switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); - - for (int stream = 0; stream < 4; stream++) { - LLVMBasicBlockRef bb; - unsigned offset; - - if (!gsinfo->num_stream_output_components[stream]) - continue; - - if (stream > 0 && !gs_selector->so.num_outputs) - continue; - - bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); - LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); - LLVMPositionBuilderAtEnd(builder, bb); - - /* Fetch vertex data from GSVS ring */ - offset = 0; - for (i = 0; i < gsinfo->num_outputs; ++i) { - for (unsigned chan = 0; chan < 4; chan++) { - if (!(gsinfo->output_usagemask[i] & (1 << chan)) || - outputs[i].vertex_stream[chan] != stream) { - outputs[i].values[chan] = LLVMGetUndef(ctx.f32); - continue; - } - - LLVMValueRef 
soffset = LLVMConstInt(ctx.i32, - offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); - offset++; - - outputs[i].values[chan] = - ac_build_buffer_load(&ctx.ac, - ctx.gsvs_ring[0], 1, - ctx.i32_0, voffset, - soffset, 0, ac_glc | ac_slc, - true, false); - } - } - - /* Streamout and exports. */ - if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { - si_llvm_emit_streamout(&ctx, outputs, - gsinfo->num_outputs, - stream); - } - - if (stream == 0) - si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); - - LLVMBuildBr(builder, end_bb); - } - - LLVMPositionBuilderAtEnd(builder, end_bb); - - LLVMBuildRetVoid(ctx.ac.builder); - - ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ - si_llvm_optimize_module(&ctx); - - bool ok = false; - if (si_compile_llvm(sscreen, &ctx.shader->binary, - &ctx.shader->config, ctx.compiler, - ctx.ac.module, - debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size, - "GS Copy Shader", false) == 0) { - if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) - fprintf(stderr, "GS Copy Shader:\n"); - si_shader_dump(sscreen, ctx.shader, debug, stderr, true); - - if (!ctx.shader->config.scratch_bytes_per_wave) - ok = si_shader_binary_upload(sscreen, ctx.shader, 0); - else - ok = true; - } - - si_llvm_dispose(&ctx); - - if (!ok) { - FREE(shader); - shader = NULL; - } else { - si_fix_resource_usage(sscreen, shader); - } - return shader; -} - static void si_dump_shader_key_vs(const struct si_shader_key *key, const struct si_vs_prolog_bits *prolog, const char *prefix, FILE *f) @@ -3052,22 +2412,6 @@ static bool si_vs_needs_prolog(const struct si_shader_selector *sel, key->unpack_instance_id_from_vertex_id; } -LLVMValueRef si_is_es_thread(struct si_shader_context *ctx) -{ - /* Return true if the current thread should execute an ES thread. 
*/ - return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); -} - -LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx) -{ - /* Return true if the current thread should execute a GS thread. */ - return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, - ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); -} - static bool si_build_main_function(struct si_shader_context *ctx, struct nir_shader *nir, bool free_nir) { @@ -3102,10 +2446,7 @@ static bool si_build_main_function(struct si_shader_context *ctx, ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; break; case PIPE_SHADER_GEOMETRY: - ctx->abi.load_inputs = si_nir_load_input_gs; - ctx->abi.emit_vertex = si_llvm_emit_vertex; - ctx->abi.emit_primitive = si_llvm_emit_primitive; - ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue; + si_llvm_init_gs_callbacks(ctx); break; case PIPE_SHADER_FRAGMENT: si_llvm_init_ps_callbacks(ctx); @@ -3121,8 +2462,15 @@ static bool si_build_main_function(struct si_shader_context *ctx, ctx->abi.load_ubo = load_ubo; ctx->abi.load_ssbo = load_ssbo; - create_function(ctx); - preload_ring_buffers(ctx); + si_create_function(ctx); + + if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_esgs_ring(ctx); + + if (ctx->type == PIPE_SHADER_GEOMETRY) + si_preload_gs_rings(ctx); + else if (ctx->type == PIPE_SHADER_TESS_EVAL) + si_llvm_preload_tes_rings(ctx); if (ctx->type == PIPE_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) { @@ -3172,7 +2520,7 @@ static bool si_build_main_function(struct si_shader_context *ctx, * avoids bank conflicts for SoA accesses. */ if (!gfx10_is_ngg_passthrough(shader)) - declare_esgs_ring(ctx); + si_llvm_declare_esgs_ring(ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. 
@@ -3324,129 +2672,6 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info, shader_out->info.uses_instanceid = true; } -/** - * Build the GS prolog function. Rotate the input vertices for triangle strips - * with adjacency. - */ -static void si_build_gs_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) -{ - unsigned num_sgprs, num_vgprs; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef returns[AC_MAX_ARGS]; - LLVMValueRef func, ret; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - if (ctx->screen->info.chip_class >= GFX9) { - if (key->gs_prolog.states.gfx9_prev_is_vs) - num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR; - else - num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR; - num_vgprs = 5; /* ES inputs are not needed by GS */ - } else { - num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - num_vgprs = 8; - } - - for (unsigned i = 0; i < num_sgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - returns[i] = ctx->i32; - } - - for (unsigned i = 0; i < num_vgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - returns[num_sgprs + i] = ctx->f32; - } - - /* Create the function. */ - si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); - func = ctx->main_fn; - - /* Set the full EXEC mask for the prolog, because we are only fiddling - * with registers here. The main shader part will set the correct EXEC - * mask. - */ - if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) - ac_init_exec_full_mask(&ctx->ac); - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. 
- */ - ret = ctx->return_value; - for (unsigned i = 0; i < num_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(builder, ret, p, i, ""); - } - for (unsigned i = 0; i < num_vgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); - } - - if (key->gs_prolog.states.tri_strip_adj_fix) { - /* Remap the input vertices for every other primitive. */ - const struct ac_arg gfx6_vtx_params[6] = { - { .used = true, .arg_index = num_sgprs }, - { .used = true, .arg_index = num_sgprs + 1 }, - { .used = true, .arg_index = num_sgprs + 3 }, - { .used = true, .arg_index = num_sgprs + 4 }, - { .used = true, .arg_index = num_sgprs + 5 }, - { .used = true, .arg_index = num_sgprs + 6 }, - }; - const struct ac_arg gfx9_vtx_params[3] = { - { .used = true, .arg_index = num_sgprs }, - { .used = true, .arg_index = num_sgprs + 1 }, - { .used = true, .arg_index = num_sgprs + 4 }, - }; - LLVMValueRef vtx_in[6], vtx_out[6]; - LLVMValueRef prim_id, rotate; - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); - vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); - } - } else { - for (unsigned i = 0; i < 6; i++) - vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); - } - - prim_id = LLVMGetParam(func, num_sgprs + 2); - rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); - - for (unsigned i = 0; i < 6; ++i) { - LLVMValueRef base, rotated; - base = vtx_in[i]; - rotated = vtx_in[(i + 4) % 6]; - vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); - } - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef hi, out; - - hi = LLVMBuildShl(builder, vtx_out[i*2+1], - LLVMConstInt(ctx->i32, 16, 0), ""); - out = LLVMBuildOr(builder, vtx_out[i*2], hi, ""); - out = ac_to_float(&ctx->ac, out); - ret 
= LLVMBuildInsertValue(builder, ret, out, - gfx9_vtx_params[i].arg_index, ""); - } - } else { - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef out; - - out = ac_to_float(&ctx->ac, vtx_out[i]); - ret = LLVMBuildInsertValue(builder, ret, out, - gfx6_vtx_params[i].arg_index, ""); - } - } - } - - LLVMBuildRet(builder, ret); -} - /** * Given a list of shader part functions, build a wrapper function that * runs them in sequence to form a monolithic shader. @@ -3900,7 +3125,7 @@ int si_compile_shader(struct si_screen *sscreen, gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; gs_prolog_key.gs_prolog.is_monolithic = true; gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - si_build_gs_prolog_function(&ctx, &gs_prolog_key); + si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); gs_prolog = ctx.main_fn; /* ES main part */ @@ -3959,7 +3184,7 @@ int si_compile_shader(struct si_screen *sscreen, memset(&prolog_key, 0, sizeof(prolog_key)); prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - si_build_gs_prolog_function(&ctx, &prolog_key); + si_llvm_build_gs_prolog(&ctx, &prolog_key); parts[0] = ctx.main_fn; si_build_wrapper_function(&ctx, parts, 2, 1, 0); @@ -4431,7 +3656,7 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs, PIPE_SHADER_GEOMETRY, true, &prolog_key, compiler, debug, - si_build_gs_prolog_function, + si_llvm_build_gs_prolog, "Geometry Shader Prolog"); return shader->prolog2 != NULL; } @@ -4722,8 +3947,7 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, *lds_size = MAX2(*lds_size, 8); } -static void si_fix_resource_usage(struct si_screen *sscreen, - struct si_shader *shader) +void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) { unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 7a26083eb49..cfddda02128 
100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -814,11 +814,6 @@ struct si_shader_part { }; /* si_shader.c */ -struct si_shader * -si_generate_gs_copy_shader(struct si_screen *sscreen, - struct ac_llvm_compiler *compiler, - struct si_shader_selector *gs_selector, - struct pipe_debug_callback *debug); int si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, @@ -844,6 +839,13 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); +/* si_shader_llvm_gs.c */ +struct si_shader * +si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug); + /* si_shader_nir.c */ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index dd3dafe77bf..8279a6826b2 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -260,6 +260,7 @@ LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, LLVMValueRef val1, LLVMValueRef val2); void si_llvm_emit_barrier(struct si_shader_context *ctx); +void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); void si_declare_compute_memory(struct si_shader_context *ctx); LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle); @@ -287,8 +288,6 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir); LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, unsigned bitwidth); -LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); -LLVMValueRef si_is_gs_thread(struct 
si_shader_context *ctx); void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, unsigned num_parts, unsigned main_part, unsigned next_shader_first_part); @@ -304,6 +303,21 @@ LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueR struct ac_arg param, unsigned return_index); LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, struct ac_arg param, unsigned return_index); +int si_compile_llvm(struct si_screen *sscreen, + struct si_shader_binary *binary, + struct ac_shader_config *conf, + struct ac_llvm_compiler *compiler, + LLVMModuleRef mod, + struct pipe_debug_callback *debug, + enum pipe_shader_type shader_type, + unsigned wave_size, + const char *name, + bool less_optimized); +void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader); +void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream); +void si_create_function(struct si_shader_context *ctx); void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, @@ -315,6 +329,17 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); +/* si_shader_llvm_gs.c */ +LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); +LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); +void si_preload_esgs_ring(struct si_shader_context *ctx); +void si_preload_gs_rings(struct si_shader_context *ctx); +void si_llvm_build_gs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); + /* si_shader_llvm_tess.c */ void si_llvm_preload_tes_rings(struct si_shader_context *ctx); void 
si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_build.c b/src/gallium/drivers/radeonsi/si_shader_llvm_build.c index 50b02abb45d..8d0a841973c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_build.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_build.c @@ -242,3 +242,23 @@ void si_llvm_emit_barrier(struct si_shader_context *ctx) ac_build_s_barrier(&ctx->ac); } + +/* Ensure that the esgs ring is declared. + * + * We declare it with 64KB alignment as a hint that the + * pointer value will always be 0. + */ +void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) +{ + if (ctx->esgs_ring) + return; + + assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring")); + + ctx->esgs_ring = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->i32, 0), + "esgs_ring", + AC_ADDR_SPACE_LDS); + LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); +} diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c new file mode 100644 index 00000000000..69dc7080a6c --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -0,0 +1,780 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" +#include "util/u_memory.h" + +LLVMValueRef si_is_es_thread(struct si_shader_context *ctx) +{ + /* Return true if the current thread should execute an ES thread. */ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), ""); +} + +LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx) +{ + /* Return true if the current thread should execute a GS thread. */ + return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), + si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), ""); +} + +static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, + unsigned input_index, + unsigned vtx_offset_param, + LLVMTypeRef type, + unsigned swizzle) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *shader = ctx->shader; + LLVMValueRef vtx_offset, soffset; + struct si_shader_info *info = &shader->selector->info; + unsigned semantic_name = info->input_semantic_name[input_index]; + unsigned semantic_index = info->input_semantic_index[input_index]; + unsigned param; + LLVMValueRef value; + + param = si_shader_io_get_unique_index(semantic_name, semantic_index, false); + + /* GFX9 has the ESGS ring in LDS. 
*/ + if (ctx->screen->info.chip_class >= GFX9) { + unsigned index = vtx_offset_param; + + switch (index / 2) { + case 0: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, + index % 2 ? 16 : 0, 16); + break; + case 1: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, + index % 2 ? 16 : 0, 16); + break; + case 2: + vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, + index % 2 ? 16 : 0, 16); + break; + default: + assert(0); + return NULL; + } + + unsigned offset = param * 4 + swizzle; + vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, + LLVMConstInt(ctx->i32, offset, false), ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + if (ac_get_type_size(type) == 64) { + ptr = LLVMBuildGEP(ctx->ac.builder, ptr, + &ctx->ac.i32_1, 1, ""); + LLVMValueRef values[2] = { + value, + LLVMBuildLoad(ctx->ac.builder, ptr, "") + }; + value = ac_build_gather_values(&ctx->ac, values, 2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); + } + + /* GFX6: input load from the ESGS ring in memory. */ + if (swizzle == ~0) { + LLVMValueRef values[4]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, + type, chan); + } + return ac_build_gather_values(&ctx->ac, values, 4); + } + + /* Get the vertex offset parameter on GFX6. 
*/ + LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, + ctx->gs_vtx_offset[vtx_offset_param]); + + vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, + LLVMConstInt(ctx->i32, 4, 0), ""); + + soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0); + + value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0, + vtx_offset, soffset, 0, ac_glc, true, false); + if (ac_get_type_size(type) == 64) { + LLVMValueRef value2; + soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0); + + value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, + ctx->i32_0, vtx_offset, soffset, + 0, ac_glc, true, false); + return si_build_gather_64bit(ctx, type, value, value2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); +} + +static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, + unsigned location, + unsigned driver_location, + unsigned component, + unsigned num_components, + unsigned vertex_index, + unsigned const_index, + LLVMTypeRef type) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + LLVMValueRef value[4]; + for (unsigned i = 0; i < num_components; i++) { + unsigned offset = i; + if (ac_get_type_size(type) == 64) + offset *= 2; + + offset += component; + value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, + vertex_index, type, offset); + } + + return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); +} + +/* Pass GS inputs from ES to GS on GFX9. 
*/ +static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) +{ + LLVMValueRef ret = ctx->return_value; + + ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); + ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); + if (ctx->shader->key.as_ngg) + ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2); + else + ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2); + ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3); + ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5); + + ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, + 8 + SI_SGPR_RW_BUFFERS); + ret = si_insert_input_ptr(ctx, ret, + ctx->bindless_samplers_and_images, + 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); + if (ctx->screen->use_ngg) { + ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, + 8 + SI_SGPR_VS_STATE_BITS); + } + + unsigned vgpr; + if (ctx->type == PIPE_SHADER_VERTEX) + vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR; + else + vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR; + + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); + ctx->return_value = ret; +} + +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader *es = ctx->shader; + struct si_shader_info *info = &es->selector->info; + LLVMValueRef lds_base = NULL; + unsigned chan; + int i; + + if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) { + unsigned itemsize_dw = es->selector->esgs_itemsize / 4; + LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac); + LLVMValueRef wave_idx = 
si_unpack_param(ctx, ctx->merged_wave_info, 24, 4); + vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx, + LLVMBuildMul(ctx->ac.builder, wave_idx, + LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), ""); + lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx, + LLVMConstInt(ctx->i32, itemsize_dw, 0), ""); + } + + for (i = 0; i < info->num_outputs; i++) { + int param; + + if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX || + info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) + continue; + + param = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i], false); + + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan))) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + out_val = ac_to_integer(&ctx->ac, out_val); + + /* GFX9 has the ESGS ring in LDS. */ + if (ctx->screen->info.chip_class >= GFX9) { + LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false); + idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); + ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); + continue; + } + + ac_build_buffer_store_dword(&ctx->ac, + ctx->esgs_ring, + out_val, 1, NULL, + ac_get_arg(&ctx->ac, ctx->es2gs_offset), + (4 * param + chan) * 4, + ac_glc | ac_slc | ac_swizzled); + } + } + + if (ctx->screen->info.chip_class >= GFX9) + si_set_es_return_value_for_gs(ctx); +} + +static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) +{ + if (ctx->screen->info.chip_class >= GFX9) + return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8); + else + return ac_get_arg(&ctx->ac, ctx->gs_wave_id); +} + +static void emit_gs_epilogue(struct si_shader_context *ctx) +{ + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_epilogue(ctx); + return; + } + + if (ctx->screen->info.chip_class >= GFX10) + LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); + + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | 
AC_SENDMSG_GS_DONE, + si_get_gs_wave_id(ctx)); + + if (ctx->screen->info.chip_class >= GFX9) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); +} + +static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info UNUSED *info = &ctx->shader->selector->info; + + assert(info->num_outputs <= max_outputs); + + emit_gs_epilogue(ctx); +} + +/* Emit one vertex from the geometry shader */ +static void si_llvm_emit_vertex(struct ac_shader_abi *abi, + unsigned stream, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + if (ctx->shader->key.as_ngg) { + gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); + return; + } + + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader *shader = ctx->shader; + LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset); + LLVMValueRef gs_next_vertex; + LLVMValueRef can_emit; + unsigned chan, offset; + int i; + + /* Write vertex attribute values to GSVS ring */ + gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, + ctx->gs_next_vertex[stream], + ""); + + /* If this thread has already emitted the declared maximum number of + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. + * + * If the shader has no writes to memory, kill it instead. This skips + * further memory loads and may allow LLVM to skip to the end + * altogether. 
+ */ + can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex, + LLVMConstInt(ctx->i32, + shader->selector->gs_max_out_vertices, 0), ""); + + bool use_kill = !info->writes_memory; + if (use_kill) { + ac_build_kill_if_false(&ctx->ac, can_emit); + } else { + ac_build_ifcc(&ctx->ac, can_emit, 6505); + } + + offset = 0; + for (i = 0; i < info->num_outputs; i++) { + for (chan = 0; chan < 4; chan++) { + if (!(info->output_usagemask[i] & (1 << chan)) || + ((info->output_streams[i] >> (2 * chan)) & 3) != stream) + continue; + + LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + LLVMValueRef voffset = + LLVMConstInt(ctx->i32, offset * + shader->selector->gs_max_out_vertices, 0); + offset++; + + voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, ""); + voffset = LLVMBuildMul(ctx->ac.builder, voffset, + LLVMConstInt(ctx->i32, 4, 0), ""); + + out_val = ac_to_integer(&ctx->ac, out_val); + + ac_build_buffer_store_dword(&ctx->ac, + ctx->gsvs_ring[stream], + out_val, 1, + voffset, soffset, 0, + ac_glc | ac_slc | ac_swizzled); + } + } + + gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, ""); + LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]); + + /* Signal vertex emission if vertex data was written. 
*/ + if (offset) { + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); + } + + if (!use_kill) + ac_build_endif(&ctx->ac, 6505); +} + +/* Cut one primitive from the geometry shader */ +static void si_llvm_emit_primitive(struct ac_shader_abi *abi, + unsigned stream) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + if (ctx->shader->key.as_ngg) { + LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); + return; + } + + /* Signal primitive cut */ + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), + si_get_gs_wave_id(ctx)); +} + +void si_preload_esgs_ring(struct si_shader_context *ctx) +{ + if (ctx->screen->info.chip_class <= GFX8) { + unsigned ring = + ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS + : SI_ES_RING_ESGS; + LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + + ctx->esgs_ring = + ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + } else { + if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) { + /* Declare the ESGS ring as an explicit LDS symbol. */ + si_llvm_declare_esgs_ring(ctx); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + ctx->esgs_ring = ctx->ac.lds; + } + } +} + +void si_preload_gs_rings(struct si_shader_context *ctx) +{ + const struct si_shader_selector *sel = ctx->shader->selector; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + /* The conceptual layout of the GSVS ring is + * v0c0 .. vLv0 v0c1 .. vLc1 .. + * but the real memory layout is swizzled across + * threads: + * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL + * t16v0c0 .. + * Override the buffer descriptor accordingly. 
+ */ + LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2); + uint64_t stream_offset = 0; + + for (unsigned stream = 0; stream < 4; ++stream) { + unsigned num_components; + unsigned stride; + unsigned num_records; + LLVMValueRef ring, tmp; + + num_components = sel->info.num_stream_output_components[stream]; + if (!num_components) + continue; + + stride = 4 * num_components * sel->gs_max_out_vertices; + + /* Limit on the stride field for <= GFX7. */ + assert(stride < (1 << 14)); + + num_records = ctx->ac.wave_size; + + ring = LLVMBuildBitCast(builder, base_ring, v2i64, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, ""); + tmp = LLVMBuildAdd(builder, tmp, + LLVMConstInt(ctx->i64, + stream_offset, 0), ""); + stream_offset += stride * ctx->ac.wave_size; + + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, ""); + ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, ""); + tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, ""); + tmp = LLVMBuildOr(builder, tmp, + LLVMConstInt(ctx->i32, + S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE(1), 0), ""); + ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, ""); + ring = LLVMBuildInsertElement(builder, ring, + LLVMConstInt(ctx->i32, num_records, 0), + LLVMConstInt(ctx->i32, 2, 0), ""); + + uint32_t rsrc3 = + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */ + S_008F0C_ADD_TID_ENABLE(1); + + if (ctx->ac.chip_class >= GFX10) { + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | + S_008F0C_RESOURCE_LEVEL(1); + } else { + rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */ + } + + ring = LLVMBuildInsertElement(builder, ring, + 
LLVMConstInt(ctx->i32, rsrc3, false), + LLVMConstInt(ctx->i32, 3, 0), ""); + + ctx->gsvs_ring[stream] = ring; + } +} + +/* Generate code for the hardware VS shader stage to go with a geometry shader */ +struct si_shader * +si_generate_gs_copy_shader(struct si_screen *sscreen, + struct ac_llvm_compiler *compiler, + struct si_shader_selector *gs_selector, + struct pipe_debug_callback *debug) +{ + struct si_shader_context ctx; + struct si_shader *shader; + LLVMBuilderRef builder; + struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS]; + struct si_shader_info *gsinfo = &gs_selector->info; + int i; + + + shader = CALLOC_STRUCT(si_shader); + if (!shader) + return NULL; + + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. */ + util_queue_fence_init(&shader->ready); + + shader->selector = gs_selector; + shader->is_gs_copy_shader = true; + + si_llvm_context_init(&ctx, sscreen, compiler, + si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false)); + ctx.shader = shader; + ctx.type = PIPE_SHADER_VERTEX; + + builder = ctx.ac.builder; + + si_create_function(&ctx); + + LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers); + ctx.gsvs_ring[0] = ac_build_load_to_sgpr(&ctx.ac, buf_ptr, + LLVMConstInt(ctx.i32, SI_RING_GSVS, 0)); + + LLVMValueRef voffset = + LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, + LLVMConstInt(ctx.i32, 4, 0), ""); + + /* Fetch the vertex stream ID.*/ + LLVMValueRef stream_id; + + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) + stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2); + else + stream_id = ctx.i32_0; + + /* Fill in output information. 
*/ + for (i = 0; i < gsinfo->num_outputs; ++i) { + outputs[i].semantic_name = gsinfo->output_semantic_name[i]; + outputs[i].semantic_index = gsinfo->output_semantic_index[i]; + + for (int chan = 0; chan < 4; chan++) { + outputs[i].vertex_stream[chan] = + (gsinfo->output_streams[i] >> (2 * chan)) & 3; + } + } + + LLVMBasicBlockRef end_bb; + LLVMValueRef switch_inst; + + end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end"); + switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4); + + for (int stream = 0; stream < 4; stream++) { + LLVMBasicBlockRef bb; + unsigned offset; + + if (!gsinfo->num_stream_output_components[stream]) + continue; + + if (stream > 0 && !gs_selector->so.num_outputs) + continue; + + bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out"); + LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb); + LLVMPositionBuilderAtEnd(builder, bb); + + /* Fetch vertex data from GSVS ring */ + offset = 0; + for (i = 0; i < gsinfo->num_outputs; ++i) { + for (unsigned chan = 0; chan < 4; chan++) { + if (!(gsinfo->output_usagemask[i] & (1 << chan)) || + outputs[i].vertex_stream[chan] != stream) { + outputs[i].values[chan] = LLVMGetUndef(ctx.f32); + continue; + } + + LLVMValueRef soffset = LLVMConstInt(ctx.i32, + offset * gs_selector->gs_max_out_vertices * 16 * 4, 0); + offset++; + + outputs[i].values[chan] = + ac_build_buffer_load(&ctx.ac, + ctx.gsvs_ring[0], 1, + ctx.i32_0, voffset, + soffset, 0, ac_glc | ac_slc, + true, false); + } + } + + /* Streamout and exports. 
*/ + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { + si_llvm_emit_streamout(&ctx, outputs, + gsinfo->num_outputs, + stream); + } + + if (stream == 0) + si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); + + LLVMBuildBr(builder, end_bb); + } + + LLVMPositionBuilderAtEnd(builder, end_bb); + + LLVMBuildRetVoid(ctx.ac.builder); + + ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */ + si_llvm_optimize_module(&ctx); + + bool ok = false; + if (si_compile_llvm(sscreen, &ctx.shader->binary, + &ctx.shader->config, ctx.compiler, + ctx.ac.module, + debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size, + "GS Copy Shader", false) == 0) { + if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) + fprintf(stderr, "GS Copy Shader:\n"); + si_shader_dump(sscreen, ctx.shader, debug, stderr, true); + + if (!ctx.shader->config.scratch_bytes_per_wave) + ok = si_shader_binary_upload(sscreen, ctx.shader, 0); + else + ok = true; + } + + si_llvm_dispose(&ctx); + + if (!ok) { + FREE(shader); + shader = NULL; + } else { + si_fix_resource_usage(sscreen, shader); + } + return shader; +} + +/** + * Build the GS prolog function. Rotate the input vertices for triangle strips + * with adjacency. 
+ */
+void si_llvm_build_gs_prolog(struct si_shader_context *ctx,
+			     union si_shader_part_key *key)
+{
+	/* Every instruction below is emitted through this builder. */
+	LLVMBuilderRef builder = ctx->ac.builder;
+	const bool is_gfx9 = ctx->screen->info.chip_class >= GFX9;
+	LLVMTypeRef ret_types[AC_MAX_ARGS];
+	unsigned sgpr_count, vgpr_count, total;
+	LLVMValueRef fn, result;
+
+	/* Start from an empty argument list; the prolog declares only
+	 * anonymous integer registers.
+	 */
+	memset(&ctx->args, 0, sizeof(ctx->args));
+
+	if (is_gfx9) {
+		sgpr_count = 8 + (key->gs_prolog.states.gfx9_prev_is_vs ?
+				  GFX9_VSGS_NUM_USER_SGPR :
+				  GFX9_TESGS_NUM_USER_SGPR);
+		vgpr_count = 5; /* ES inputs are not needed by GS */
+	} else {
+		sgpr_count = GFX6_GS_NUM_USER_SGPR + 2;
+		vgpr_count = 8;
+	}
+	total = sgpr_count + vgpr_count;
+
+	/* Declare all SGPRs first, then all VGPRs. The return type mirrors
+	 * the arguments: i32 for each SGPR, f32 for each VGPR.
+	 */
+	for (unsigned idx = 0; idx < total; ++idx) {
+		const bool is_sgpr = idx < sgpr_count;
+
+		ac_add_arg(&ctx->args, is_sgpr ? AC_ARG_SGPR : AC_ARG_VGPR,
+			   1, AC_ARG_INT, NULL);
+		ret_types[idx] = is_sgpr ? ctx->i32 : ctx->f32;
+	}
+
+	/* Emit the function header. */
+	si_llvm_create_func(ctx, "gs_prolog", ret_types, total, 0);
+	fn = ctx->main_fn;
+
+	/* The prolog only shuffles registers, so run it with the full EXEC
+	 * mask; the main shader part sets the real mask afterwards.
+	 */
+	if (is_gfx9 && !key->gs_prolog.is_monolithic)
+		ac_init_exec_full_mask(&ctx->ac);
+
+	/* Forward every input to the matching output slot. The registers
+	 * already match, so this should cost nothing, but it keeps the
+	 * compiler from overwriting them unintentionally.
+	 */
+	result = ctx->return_value;
+	for (unsigned idx = 0; idx < total; idx++) {
+		LLVMValueRef arg = LLVMGetParam(fn, idx);
+
+		if (idx >= sgpr_count)
+			arg = ac_to_float(&ctx->ac, arg);
+		result = LLVMBuildInsertValue(builder, result, arg, idx, "");
+	}
+
+	if (key->gs_prolog.states.tri_strip_adj_fix) {
+		/* Remap the input vertices for every other primitive. */
+		const struct ac_arg gfx6_offsets[6] = {
+			{ .used = true, .arg_index = sgpr_count },
+			{ .used = true, .arg_index = sgpr_count + 1 },
+			{ .used = true, .arg_index = sgpr_count + 3 },
+			{ .used = true, .arg_index = sgpr_count + 4 },
+			{ .used = true, .arg_index = sgpr_count + 5 },
+			{ .used = true, .arg_index = sgpr_count + 6 },
+		};
+		const struct ac_arg gfx9_offsets[3] = {
+			{ .used = true, .arg_index = sgpr_count },
+			{ .used = true, .arg_index = sgpr_count + 1 },
+			{ .used = true, .arg_index = sgpr_count + 4 },
+		};
+		LLVMValueRef src[6], dst[6];
+		LLVMValueRef prim_id, odd_prim;
+
+		if (is_gfx9) {
+			/* Each GFX9 VGPR carries two 16-bit values:
+			 * bits 0..15 and bits 16..31.
+			 */
+			for (unsigned pair = 0; pair < 3; pair++) {
+				src[2 * pair] =
+					si_unpack_param(ctx, gfx9_offsets[pair], 0, 16);
+				src[2 * pair + 1] =
+					si_unpack_param(ctx, gfx9_offsets[pair], 16, 16);
+			}
+		} else {
+			for (unsigned v = 0; v < 6; v++)
+				src[v] = ac_get_arg(&ctx->ac, gfx6_offsets[v]);
+		}
+
+		/* The low bit of the third VGPR selects the rotation. */
+		prim_id = LLVMGetParam(fn, sgpr_count + 2);
+		odd_prim = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
+
+		/* When that bit is set, slot v takes the value of slot
+		 * (v + 4) % 6; otherwise it keeps its own.
+		 */
+		for (unsigned v = 0; v < 6; ++v) {
+			dst[v] = LLVMBuildSelect(builder, odd_prim,
+						 src[(v + 4) % 6], src[v], "");
+		}
+
+		if (is_gfx9) {
+			/* Re-pack each pair into one 32-bit VGPR. */
+			for (unsigned pair = 0; pair < 3; pair++) {
+				LLVMValueRef hi16, packed;
+
+				hi16 = LLVMBuildShl(builder, dst[2 * pair + 1],
+						    LLVMConstInt(ctx->i32, 16, 0), "");
+				packed = LLVMBuildOr(builder, dst[2 * pair], hi16, "");
+				packed = ac_to_float(&ctx->ac, packed);
+				result = LLVMBuildInsertValue(builder, result, packed,
+							      gfx9_offsets[pair].arg_index, "");
+			}
+		} else {
+			for (unsigned v = 0; v < 6; v++) {
+				LLVMValueRef as_float = ac_to_float(&ctx->ac, dst[v]);
+
+				result = LLVMBuildInsertValue(builder, result, as_float,
+							      gfx6_offsets[v].arg_index, "");
+			}
+		}
+	}
+
+	LLVMBuildRet(builder, result);
+}
+
+/* Install the geometry-shader ABI callbacks on the shader context. */
+void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
+{
+	struct ac_shader_abi *abi = &ctx->abi;
+
+	abi->load_inputs = si_nir_load_input_gs;
+	abi->emit_vertex = si_llvm_emit_vertex;
+	abi->emit_primitive = si_llvm_emit_primitive;
+	abi->emit_outputs = si_llvm_emit_gs_epilogue;
+}