From cd5b99c541d241df51cae35d75f502fcfbd179ce Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 15 Jan 2020 18:01:19 -0500 Subject: [PATCH] radeonsi: move VS shader code into si_shader_llvm_vs.c Reviewed-by: Timothy Arceri Part-of: --- src/gallium/drivers/radeonsi/Makefile.sources | 1 + .../drivers/radeonsi/gfx10_shader_ngg.c | 6 +- src/gallium/drivers/radeonsi/meson.build | 1 + src/gallium/drivers/radeonsi/si_shader.c | 1093 +--------------- .../drivers/radeonsi/si_shader_internal.h | 40 +- .../drivers/radeonsi/si_shader_llvm_build.c | 12 + .../drivers/radeonsi/si_shader_llvm_gs.c | 2 +- .../drivers/radeonsi/si_shader_llvm_tess.c | 11 +- .../drivers/radeonsi/si_shader_llvm_vs.c | 1130 +++++++++++++++++ src/gallium/drivers/radeonsi/si_shader_nir.c | 51 +- 10 files changed, 1188 insertions(+), 1159 deletions(-) create mode 100644 src/gallium/drivers/radeonsi/si_shader_llvm_vs.c diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources index 73e544a1d5d..bc4f9bc2166 100644 --- a/src/gallium/drivers/radeonsi/Makefile.sources +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -41,6 +41,7 @@ C_SOURCES := \ si_shader_llvm_ps.c \ si_shader_llvm_resources.c \ si_shader_llvm_tess.c \ + si_shader_llvm_vs.c \ si_shader_nir.c \ si_shaderlib_tgsi.c \ si_state.c \ diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 852842d8059..63439733507 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -252,7 +252,7 @@ static void build_streamout_vertex(struct si_shader_context *ctx, (info->output_streams[reg] >> (2 * comp)) & 3; } - si_emit_streamout_output(ctx, so_buffer, offset, &so->output[i], &out); + si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out); } } @@ -1486,7 +1486,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, i++; } - 
si_llvm_export_vs(ctx, outputs, i); + si_llvm_build_vs_exports(ctx, outputs, i); } ac_build_endif(&ctx->ac, 6002); } @@ -1970,7 +1970,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) } } - si_llvm_export_vs(ctx, outputs, info->num_outputs); + si_llvm_build_vs_exports(ctx, outputs, info->num_outputs); } ac_build_endif(&ctx->ac, 5145); } diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 5a09c0a923b..16e313e37c1 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -56,6 +56,7 @@ files_libradeonsi = files( 'si_shader_llvm_ps.c', 'si_shader_llvm_resources.c', 'si_shader_llvm_tess.c', + 'si_shader_llvm_vs.c', 'si_shader_nir.c', 'si_shaderlib_tgsi.c', 'si_state.c', diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 9f8be2b7214..24494513fbe 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -43,9 +43,6 @@ static const char scratch_rsrc_dword1_symbol[] = static void si_dump_shader_key(const struct si_shader *shader, FILE *f); -static void si_build_vs_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key); - /** Whether the shader runs as a combination of multiple API shaders */ static bool is_multi_part_shader(struct si_shader_context *ctx) { @@ -180,227 +177,6 @@ LLVMValueRef si_unpack_param(struct si_shader_context *ctx, return unpack_llvm_param(ctx, value, rshift, bitwidth); } -static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, - LLVMValueRef i32, unsigned index) -{ - assert(index <= 1); - - if (index == 1) - return LLVMBuildAShr(ctx->ac.builder, i32, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - - return LLVMBuildSExt(ctx->ac.builder, - LLVMBuildTrunc(ctx->ac.builder, i32, - ctx->ac.i16, ""), - ctx->ac.i32, ""); -} - -void si_llvm_load_input_vs( - struct si_shader_context *ctx, - unsigned input_index, - 
LLVMValueRef out[4]) -{ - const struct si_shader_info *info = &ctx->shader->selector->info; - unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; - - if (vs_blit_property) { - LLVMValueRef vertex_id = ctx->abi.vertex_id; - LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntULE, vertex_id, - ctx->ac.i32_1, ""); - /* Use LLVMIntNE, because we have 3 vertices and only - * the middle one should use y2. - */ - LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, - LLVMIntNE, vertex_id, - ctx->ac.i32_1, ""); - - unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; - if (input_index == 0) { - /* Position: */ - LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs); - LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 1); - - LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); - LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); - LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); - LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); - - LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - - out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); - out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); - out[2] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 2); - out[3] = ctx->ac.f32_1; - return; - } - - /* Color or texture coordinates: */ - assert(input_index == 1); - - if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { - for (int i = 0; i < 4; i++) { - out[i] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 3 + i); - } - } else { - assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); - LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 3); - LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 4); - LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 5); - LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, - 
param_vs_blit_inputs + 6); - - out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, - x1, x2, ""); - out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, - y1, y2, ""); - out[2] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 7); - out[3] = LLVMGetParam(ctx->main_fn, - param_vs_blit_inputs + 8); - } - return; - } - - unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; - union si_vs_fix_fetch fix_fetch; - LLVMValueRef vb_desc; - LLVMValueRef vertex_index; - LLVMValueRef tmp; - - if (input_index < num_vbos_in_user_sgprs) { - vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); - } else { - unsigned index= input_index - num_vbos_in_user_sgprs; - vb_desc = ac_build_load_to_sgpr(&ctx->ac, - ac_get_arg(&ctx->ac, ctx->vertex_buffers), - LLVMConstInt(ctx->ac.i32, index, 0)); - } - - vertex_index = LLVMGetParam(ctx->main_fn, - ctx->vertex_index0.arg_index + - input_index); - - /* Use the open-coded implementation for all loads of doubles and - * of dword-sized data that needs fixups. We need to insert conversion - * code anyway, and the amd/common code does it for us. - * - * Note: On LLVM <= 8, we can only open-code formats with - * channel size >= 4 bytes. - */ - bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); - fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; - if (opencode || - (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || - (fix_fetch.u.log_size == 2)) { - tmp = ac_build_opencoded_load_format( - &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, - fix_fetch.u.format, fix_fetch.u.reverse, !opencode, - vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); - for (unsigned i = 0; i < 4; ++i) - out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); - return; - } - - /* Do multiple loads for special formats. 
*/ - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); - LLVMValueRef fetches[4]; - unsigned num_fetches; - unsigned fetch_stride; - unsigned channels_per_fetch; - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { - num_fetches = MIN2(required_channels, 3); - fetch_stride = 1 << fix_fetch.u.log_size; - channels_per_fetch = 1; - } else { - num_fetches = 1; - fetch_stride = 0; - channels_per_fetch = required_channels; - } - - for (unsigned i = 0; i < num_fetches; ++i) { - LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); - fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, - channels_per_fetch, 0, true); - } - - if (num_fetches == 1 && channels_per_fetch > 1) { - LLVMValueRef fetch = fetches[0]; - for (unsigned i = 0; i < channels_per_fetch; ++i) { - tmp = LLVMConstInt(ctx->ac.i32, i, false); - fetches[i] = LLVMBuildExtractElement( - ctx->ac.builder, fetch, tmp, ""); - } - num_fetches = channels_per_fetch; - channels_per_fetch = 1; - } - - for (unsigned i = num_fetches; i < 4; ++i) - fetches[i] = LLVMGetUndef(ctx->ac.f32); - - if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && - required_channels == 4) { - if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) - fetches[3] = ctx->ac.i32_1; - else - fetches[3] = ctx->ac.f32_1; - } else if (fix_fetch.u.log_size == 3 && - (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || - fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || - fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && - required_channels == 4) { - /* For 2_10_10_10, the hardware returns an unsigned value; - * convert it to a signed one. - */ - LLVMValueRef tmp = fetches[3]; - LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); - - /* First, recover the sign-extended signed integer value. 
*/ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) - tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); - else - tmp = ac_to_integer(&ctx->ac, tmp); - - /* For the integer-like cases, do a natural sign extension. - * - * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 - * and happen to contain 0, 1, 2, 3 as the two LSBs of the - * exponent. - */ - tmp = LLVMBuildShl(ctx->ac.builder, tmp, - fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? - LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); - tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); - - /* Convert back to the right type. */ - if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { - LLVMValueRef clamp; - LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); - clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); - tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); - } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { - tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); - } - - fetches[3] = tmp; - } - - for (unsigned i = 0; i < 4; ++i) - out[i] = ac_to_float(&ctx->ac, fetches[i]); -} - LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle) { @@ -422,26 +198,6 @@ LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, } } -static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - /* For non-indexed draws, the base vertex set by the driver - * (for direct draws) or the CP (for indirect draws) is the - * first vertex ID, but GLSL expects 0 to be returned. 
- */ - LLVMValueRef vs_state = ac_get_arg(&ctx->ac, - ctx->vs_state_bits); - LLVMValueRef indexed; - - indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); - indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); - - return LLVMBuildSelect(ctx->ac.builder, indexed, - ac_get_arg(&ctx->ac, ctx->args.base_vertex), - ctx->ac.i32_0, ""); -} - static LLVMValueRef get_block_size(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); @@ -488,62 +244,6 @@ void si_declare_compute_memory(struct si_shader_context *ctx) ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""); } -/* Initialize arguments for the shader export intrinsic */ -static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, - LLVMValueRef *values, - unsigned target, - struct ac_export_args *args) -{ - args->enabled_channels = 0xf; /* writemask - default is 0xf */ - args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ - args->done = 0; /* Specify whether this is the last export */ - args->target = target; /* Specify the target we are exporting */ - args->compr = false; - - memcpy(&args->out[0], values, sizeof(values[0]) * 4); -} - -static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, - struct ac_export_args *pos, LLVMValueRef *out_elts) -{ - unsigned reg_index; - unsigned chan; - unsigned const_chan; - LLVMValueRef base_elt; - LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); - LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, - SI_VS_CONST_CLIP_PLANES, 0); - LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); - - for (reg_index = 0; reg_index < 2; reg_index ++) { - struct ac_export_args *args = &pos[2 + reg_index]; - - args->out[0] = - args->out[1] = - args->out[2] = - args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); - - /* Compute dot products of position and user clip plane vectors */ - for (chan = 0; chan < 4; chan++) { - for 
(const_chan = 0; const_chan < 4; const_chan++) { - LLVMValueRef addr = - LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + - const_chan) * 4, 0); - base_elt = si_buffer_load_const(ctx, const_resource, - addr); - args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, - out_elts[const_chan], args->out[chan]); - } - } - - args->enabled_channels = 0xf; - args->valid_mask = 0; - args->done = 0; - args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; - args->compr = 0; - } -} - static void si_dump_streamout(struct pipe_stream_output_info *so) { unsigned i; @@ -565,498 +265,6 @@ static void si_dump_streamout(struct pipe_stream_output_info *so) } } -void si_emit_streamout_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values *shader_out) -{ - unsigned buf_idx = stream_out->output_buffer; - unsigned start = stream_out->start_component; - unsigned num_comps = stream_out->num_components; - LLVMValueRef out[4]; - - assert(num_comps && num_comps <= 4); - if (!num_comps || num_comps > 4) - return; - - /* Load the output as int. */ - for (int j = 0; j < num_comps; j++) { - assert(stream_out->stream == shader_out->vertex_stream[start + j]); - - out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); - } - - /* Pack the output. 
*/ - LLVMValueRef vdata = NULL; - - switch (num_comps) { - case 1: /* as i32 */ - vdata = out[0]; - break; - case 2: /* as v2i32 */ - case 3: /* as v3i32 */ - if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { - vdata = ac_build_gather_values(&ctx->ac, out, num_comps); - break; - } - /* as v4i32 (aligned to 4) */ - out[3] = LLVMGetUndef(ctx->ac.i32); - /* fall through */ - case 4: /* as v4i32 */ - vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); - break; - } - - ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], - vdata, num_comps, - so_write_offsets[buf_idx], - ctx->ac.i32_0, - stream_out->dst_offset * 4, ac_glc | ac_slc); -} - -/** - * Write streamout data to buffers for vertex stream @p stream (different - * vertex streams can occur for GS copy shaders). - */ -void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream) -{ - struct si_shader_selector *sel = ctx->shader->selector; - struct pipe_stream_output_info *so = &sel->so; - LLVMBuilderRef builder = ctx->ac.builder; - int i; - - /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ - LLVMValueRef so_vtx_count = - si_unpack_param(ctx, ctx->streamout_config, 16, 7); - - LLVMValueRef tid = ac_get_thread_id(&ctx->ac); - - /* can_emit = tid < so_vtx_count; */ - LLVMValueRef can_emit = - LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); - - /* Emit the streamout code conditionally. This actually avoids - * out-of-bounds buffer access. The hw tells us via the SGPR - * (so_vtx_count) which threads are allowed to emit streamout data. 
*/ - ac_build_ifcc(&ctx->ac, can_emit, 6501); - { - /* The buffer offset is computed as follows: - * ByteOffset = streamout_offset[buffer_id]*4 + - * (streamout_write_index + thread_id)*stride[buffer_id] + - * attrib_offset - */ - - LLVMValueRef so_write_index = - ac_get_arg(&ctx->ac, - ctx->streamout_write_index); - - /* Compute (streamout_write_index + thread_id). */ - so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); - - /* Load the descriptor and compute the write offset for each - * enabled buffer. */ - LLVMValueRef so_write_offset[4] = {}; - LLVMValueRef so_buffers[4]; - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, - ctx->rw_buffers); - - for (i = 0; i < 4; i++) { - if (!so->stride[i]) - continue; - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, - SI_VS_STREAMOUT_BUF0 + i, 0); - - so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - - LLVMValueRef so_offset = ac_get_arg(&ctx->ac, - ctx->streamout_offset[i]); - so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); - - so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, - LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0), - so_offset); - } - - /* Write streamout data. 
*/ - for (i = 0; i < so->num_outputs; i++) { - unsigned reg = so->output[i].register_index; - - if (reg >= noutput) - continue; - - if (stream != so->output[i].stream) - continue; - - si_emit_streamout_output(ctx, so_buffers, so_write_offset, - &so->output[i], &outputs[reg]); - } - } - ac_build_endif(&ctx->ac, 6501); -} - -static void si_export_param(struct si_shader_context *ctx, unsigned index, - LLVMValueRef *values) -{ - struct ac_export_args args; - - si_llvm_init_vs_export_args(ctx, values, - V_008DFC_SQ_EXP_PARAM + index, &args); - ac_build_export(&ctx->ac, &args); -} - -static void si_build_param_exports(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) -{ - struct si_shader *shader = ctx->shader; - unsigned param_count = 0; - - for (unsigned i = 0; i < noutput; i++) { - unsigned semantic_name = outputs[i].semantic_name; - unsigned semantic_index = outputs[i].semantic_index; - - if (outputs[i].vertex_stream[0] != 0 && - outputs[i].vertex_stream[1] != 0 && - outputs[i].vertex_stream[2] != 0 && - outputs[i].vertex_stream[3] != 0) - continue; - - switch (semantic_name) { - case TGSI_SEMANTIC_LAYER: - case TGSI_SEMANTIC_VIEWPORT_INDEX: - case TGSI_SEMANTIC_CLIPDIST: - case TGSI_SEMANTIC_COLOR: - case TGSI_SEMANTIC_BCOLOR: - case TGSI_SEMANTIC_PRIMID: - case TGSI_SEMANTIC_FOG: - case TGSI_SEMANTIC_TEXCOORD: - case TGSI_SEMANTIC_GENERIC: - break; - default: - continue; - } - - if ((semantic_name != TGSI_SEMANTIC_GENERIC || - semantic_index < SI_MAX_IO_GENERIC) && - shader->key.opt.kill_outputs & - (1ull << si_shader_io_get_unique_index(semantic_name, - semantic_index, true))) - continue; - - si_export_param(ctx, param_count, outputs[i].values); - - assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); - shader->info.vs_output_param_offset[i] = param_count++; - } - - shader->info.nr_param_exports = param_count; -} - -/** - * Vertex color clamping. 
- * - * This uses a state constant loaded in a user data SGPR and - * an IF statement is added that clamps all colors if the constant - * is true. - */ -static void si_vertex_color_clamping(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) -{ - LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; - bool has_colors = false; - - /* Store original colors to alloca variables. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); - LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); - } - has_colors = true; - } - - if (!has_colors) - return; - - /* The state is in the first bit of the user SGPR. */ - LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); - cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); - - ac_build_ifcc(&ctx->ac, cond, 6502); - - /* Store clamped colors to alloca variables within the conditional block. */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - LLVMBuildStore(ctx->ac.builder, - ac_build_clamp(&ctx->ac, outputs[i].values[j]), - addr[i][j]); - } - } - ac_build_endif(&ctx->ac, 6502); - - /* Load clamped colors */ - for (unsigned i = 0; i < noutput; i++) { - if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && - outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) - continue; - - for (unsigned j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); - } - } -} - -/* Generate export instructions for hardware VS shader stage or NGG GS stage - * (position and parameter data only). 
- */ -void si_llvm_export_vs(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput) -{ - struct si_shader *shader = ctx->shader; - struct ac_export_args pos_args[4] = {}; - LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; - unsigned pos_idx; - int i; - - si_vertex_color_clamping(ctx, outputs, noutput); - - /* Build position exports. */ - for (i = 0; i < noutput; i++) { - switch (outputs[i].semantic_name) { - case TGSI_SEMANTIC_POSITION: - si_llvm_init_vs_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS, &pos_args[0]); - break; - case TGSI_SEMANTIC_PSIZE: - psize_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_LAYER: - layer_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_VIEWPORT_INDEX: - viewport_index_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_EDGEFLAG: - edgeflag_value = outputs[i].values[0]; - break; - case TGSI_SEMANTIC_CLIPDIST: - if (!shader->key.opt.clip_disable) { - unsigned index = 2 + outputs[i].semantic_index; - si_llvm_init_vs_export_args(ctx, outputs[i].values, - V_008DFC_SQ_EXP_POS + index, - &pos_args[index]); - } - break; - case TGSI_SEMANTIC_CLIPVERTEX: - if (!shader->key.opt.clip_disable) { - si_llvm_emit_clipvertex(ctx, pos_args, - outputs[i].values); - } - break; - } - } - - /* We need to add the position output manually if it's missing. */ - if (!pos_args[0].out[0]) { - pos_args[0].enabled_channels = 0xf; /* writemask */ - pos_args[0].valid_mask = 0; /* EXEC mask */ - pos_args[0].done = 0; /* last export? 
*/ - pos_args[0].target = V_008DFC_SQ_EXP_POS; - pos_args[0].compr = 0; /* COMPR flag */ - pos_args[0].out[0] = ctx->ac.f32_0; /* X */ - pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[0].out[3] = ctx->ac.f32_1; /* W */ - } - - bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && - !shader->key.as_ngg; - - /* Write the misc vector (point size, edgeflag, layer, viewport). */ - if (shader->selector->info.writes_psize || - pos_writes_edgeflag || - shader->selector->info.writes_viewport_index || - shader->selector->info.writes_layer) { - pos_args[1].enabled_channels = shader->selector->info.writes_psize | - (pos_writes_edgeflag << 1) | - (shader->selector->info.writes_layer << 2); - - pos_args[1].valid_mask = 0; /* EXEC mask */ - pos_args[1].done = 0; /* last export? */ - pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; - pos_args[1].compr = 0; /* COMPR flag */ - pos_args[1].out[0] = ctx->ac.f32_0; /* X */ - pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ - pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ - pos_args[1].out[3] = ctx->ac.f32_0; /* W */ - - if (shader->selector->info.writes_psize) - pos_args[1].out[0] = psize_value; - - if (pos_writes_edgeflag) { - /* The output is a float, but the hw expects an integer - * with the first bit containing the edge flag. */ - edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, - edgeflag_value, - ctx->ac.i32, ""); - edgeflag_value = ac_build_umin(&ctx->ac, - edgeflag_value, - ctx->ac.i32_1); - - /* The LLVM intrinsic expects a float. */ - pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); - } - - if (ctx->screen->info.chip_class >= GFX9) { - /* GFX9 has the layer in out.z[10:0] and the viewport - * index in out.z[19:16]. 
- */ - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - LLVMValueRef v = viewport_index_value; - - v = ac_to_integer(&ctx->ac, v); - v = LLVMBuildShl(ctx->ac.builder, v, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - v = LLVMBuildOr(ctx->ac.builder, v, - ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); - pos_args[1].out[2] = ac_to_float(&ctx->ac, v); - pos_args[1].enabled_channels |= 1 << 2; - } - } else { - if (shader->selector->info.writes_layer) - pos_args[1].out[2] = layer_value; - - if (shader->selector->info.writes_viewport_index) { - pos_args[1].out[3] = viewport_index_value; - pos_args[1].enabled_channels |= 1 << 3; - } - } - } - - for (i = 0; i < 4; i++) - if (pos_args[i].out[0]) - shader->info.nr_pos_exports++; - - /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. - * Setting valid_mask=1 prevents it and has no other effect. - */ - if (ctx->screen->info.family == CHIP_NAVI10 || - ctx->screen->info.family == CHIP_NAVI12 || - ctx->screen->info.family == CHIP_NAVI14) - pos_args[0].valid_mask = 1; - - pos_idx = 0; - for (i = 0; i < 4; i++) { - if (!pos_args[i].out[0]) - continue; - - /* Specify the target we are exporting */ - pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; - - if (pos_idx == shader->info.nr_pos_exports) - /* Specify that this is the last export */ - pos_args[i].done = 1; - - ac_build_export(&ctx->ac, &pos_args[i]); - } - - /* Build parameter exports. 
*/ - si_build_param_exports(ctx, outputs, noutput); -} - -static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - struct si_shader_output_values *outputs = NULL; - int i,j; - - assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); - - outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); - - for (i = 0; i < info->num_outputs; i++) { - outputs[i].semantic_name = info->output_semantic_name[i]; - outputs[i].semantic_index = info->output_semantic_index[i]; - - for (j = 0; j < 4; j++) { - outputs[i].values[j] = - LLVMBuildLoad(ctx->ac.builder, - addrs[4 * i + j], - ""); - outputs[i].vertex_stream[j] = - (info->output_streams[i] >> (2 * j)) & 3; - } - } - - if (!ctx->screen->use_ngg_streamout && - ctx->shader->selector->so.num_outputs) - si_llvm_emit_streamout(ctx, outputs, i, 0); - - /* Export PrimitiveID. 
*/ - if (ctx->shader->key.mono.u.vs_export_prim_id) { - outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; - outputs[i].semantic_index = 0; - outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); - for (j = 1; j < 4; j++) - outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); - - memset(outputs[i].vertex_stream, 0, - sizeof(outputs[i].vertex_stream)); - i++; - } - - si_llvm_export_vs(ctx, outputs, i); - FREE(outputs); -} - -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, - unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= max_outputs); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. 
*/ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; -} - static void declare_streamout_params(struct si_shader_context *ctx, struct pipe_stream_output_info *so) { @@ -2274,18 +1482,6 @@ static void si_optimize_vs_outputs(struct si_shader_context *ctx) &shader->info.nr_param_exports); } -static void si_init_exec_from_input(struct si_shader_context *ctx, - struct ac_arg param, unsigned bitoffset) -{ - LLVMValueRef args[] = { - ac_get_arg(&ctx->ac, param), - LLVMConstInt(ctx->ac.i32, bitoffset, 0), - }; - ac_build_intrinsic(&ctx->ac, - "llvm.amdgcn.init.exec.from.input", - ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); -} - static bool si_vs_needs_prolog(const struct si_shader_selector *sel, const struct si_vs_prolog_bits *prolog_key, const struct si_shader_key *key, @@ -2310,34 +1506,13 @@ static bool si_build_main_function(struct si_shader_context *ctx, switch (ctx->type) { case PIPE_SHADER_VERTEX: - if (shader->key.as_ls) - ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; - else if (shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; - else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; - else if (shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; - ctx->abi.load_base_vertex = get_base_vertex; + si_llvm_init_vs_callbacks(ctx, ngg_cull_shader); break; case PIPE_SHADER_TESS_CTRL: si_llvm_init_tcs_callbacks(ctx); break; case PIPE_SHADER_TESS_EVAL: - si_llvm_init_tes_callbacks(ctx); - - if (shader->key.as_es) - ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (ngg_cull_shader) - ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; - else if 
(shader->key.as_ngg) - ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; - else - ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + si_llvm_init_tes_callbacks(ctx, ngg_cull_shader); break; case PIPE_SHADER_GEOMETRY: si_llvm_init_gs_callbacks(ctx); @@ -2987,7 +2162,7 @@ int si_compile_shader(struct si_screen *sscreen, &shader->key.part.vs.prolog, shader, &prolog_key); prolog_key.vs_prolog.is_monolithic = true; - si_build_vs_prolog_function(&ctx, &prolog_key); + si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; has_prolog = true; } @@ -3003,7 +2178,7 @@ int si_compile_shader(struct si_screen *sscreen, &shader->key.part.vs.prolog, shader, &prolog_key); prolog_key.vs_prolog.is_monolithic = true; - si_build_vs_prolog_function(&ctx, &prolog_key); + si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; has_prolog = true; } @@ -3066,7 +2241,7 @@ int si_compile_shader(struct si_screen *sscreen, &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key); vs_prolog_key.vs_prolog.is_monolithic = true; - si_build_vs_prolog_function(&ctx, &vs_prolog_key); + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); parts[0] = ctx.main_fn; } @@ -3137,7 +2312,7 @@ int si_compile_shader(struct si_screen *sscreen, &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key); vs_prolog_key.vs_prolog.is_monolithic = true; - si_build_vs_prolog_function(&ctx, &vs_prolog_key); + si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); es_prolog = ctx.main_fn; } @@ -3344,260 +2519,6 @@ out: return result; } -/** - * Build the vertex shader prolog function. - * - * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). - * All inputs are returned unmodified. The vertex load indices are - * stored after them, which will be used by the API VS for fetching inputs. 
- * - * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: - * input_v0, - * input_v1, - * input_v2, - * input_v3, - * (VertexID + BaseVertex), - * (InstanceID + StartInstance), - * (InstanceID / 2 + StartInstance) - */ -static void si_build_vs_prolog_function(struct si_shader_context *ctx, - union si_shader_part_key *key) -{ - LLVMTypeRef *returns; - LLVMValueRef ret, func; - int num_returns, i; - unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; - unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 + - (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0); - struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; - struct ac_arg input_vgpr_param[13]; - LLVMValueRef input_vgprs[13]; - unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + - num_input_vgprs; - unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ - returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * - sizeof(LLVMTypeRef)); - num_returns = 0; - - /* Declare input and output SGPRs. */ - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, - &input_sgpr_param[i]); - returns[num_returns++] = ctx->ac.i32; - } - - struct ac_arg merged_wave_info = input_sgpr_param[3]; - - /* Preloaded VGPRs (outputs must be floats) */ - for (i = 0; i < num_input_vgprs; i++) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); - returns[num_returns++] = ctx->ac.f32; - } - - /* Vertex load indices. */ - for (i = 0; i < key->vs_prolog.num_inputs; i++) - returns[num_returns++] = ctx->ac.f32; - - /* Create the function. 
*/ - si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); - func = ctx->main_fn; - - for (i = 0; i < num_input_vgprs; i++) { - input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); - } - - if (key->vs_prolog.num_merged_next_stage_vgprs) { - if (!key->vs_prolog.is_monolithic) - si_init_exec_from_input(ctx, merged_wave_info, 0); - - if (key->vs_prolog.as_ls && - ctx->screen->info.has_ls_vgpr_init_bug) { - /* If there are no HS threads, SPI loads the LS VGPRs - * starting at VGPR 0. Shift them back to where they - * belong. - */ - LLVMValueRef has_hs_threads = - LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, - si_unpack_param(ctx, input_sgpr_param[3], 8, 8), - ctx->ac.i32_0, ""); - - for (i = 4; i > 0; --i) { - input_vgprs[i + 1] = - LLVMBuildSelect(ctx->ac.builder, has_hs_threads, - input_vgprs[i + 1], - input_vgprs[i - 1], ""); - } - } - } - - if (key->vs_prolog.gs_fast_launch_tri_list || - key->vs_prolog.gs_fast_launch_tri_strip) { - LLVMValueRef wave_id, thread_id_in_tg; - - wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); - thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id, - LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), - ac_get_thread_id(&ctx->ac)); - - /* The GS fast launch initializes all VGPRs to the value of - * the first thread, so we have to add the thread ID. - * - * Only these are initialized by the hw: - * VGPR2: Base Primitive ID - * VGPR5: Base Vertex ID - * VGPR6: Instance ID - */ - - /* Put the vertex thread IDs into VGPRs as-is instead of packing them. - * The NGG cull shader will read them from there. 
- */ - if (key->vs_prolog.gs_fast_launch_tri_list) { - input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ - LLVMConstInt(ctx->ac.i32, 0, 0)); - input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ - LLVMConstInt(ctx->ac.i32, 1, 0)); - input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ - LLVMConstInt(ctx->ac.i32, 2, 0)); - } else { - assert(key->vs_prolog.gs_fast_launch_tri_strip); - LLVMBuilderRef builder = ctx->ac.builder; - /* Triangle indices: */ - LLVMValueRef index[3] = { - thread_id_in_tg, - LLVMBuildAdd(builder, thread_id_in_tg, - LLVMConstInt(ctx->ac.i32, 1, 0), ""), - LLVMBuildAdd(builder, thread_id_in_tg, - LLVMConstInt(ctx->ac.i32, 2, 0), ""), - }; - LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, - thread_id_in_tg, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = - LLVMBuildICmp(builder, LLVMIntEQ, - si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, - flatshade_first, index); - input_vgprs[0] = index[0]; - input_vgprs[1] = index[1]; - input_vgprs[4] = index[2]; - } - - /* Triangles always have all edge flags set initially. */ - input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); - - input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], - thread_id_in_tg, ""); /* PrimID */ - input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], - thread_id_in_tg, ""); /* VertexID */ - input_vgprs[8] = input_vgprs[6]; /* InstanceID */ - } - - unsigned vertex_id_vgpr = first_vs_vgpr; - unsigned instance_id_vgpr = - ctx->screen->info.chip_class >= GFX10 ? - first_vs_vgpr + 3 : - first_vs_vgpr + (key->vs_prolog.as_ls ? 
2 : 1); - - ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; - ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; - - /* InstanceID = VertexID >> 16; - * VertexID = VertexID & 0xffff; - */ - if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { - ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 16, 0), ""); - ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); - } - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); - } - for (i = 0; i < num_input_vgprs; i++) { - LLVMValueRef p = input_vgprs[i]; - - if (i == vertex_id_vgpr) - p = ctx->abi.vertex_id; - else if (i == instance_id_vgpr) - p = ctx->abi.instance_id; - - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, - key->vs_prolog.num_input_sgprs + i, ""); - } - - /* Compute vertex load indices from instance divisors. 
*/ - LLVMValueRef instance_divisor_constbuf = NULL; - - if (key->vs_prolog.states.instance_divisor_is_fetched) { - LLVMValueRef list = si_prolog_get_rw_buffers(ctx); - LLVMValueRef buf_index = - LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); - instance_divisor_constbuf = - ac_build_load_to_sgpr(&ctx->ac, list, buf_index); - } - - for (i = 0; i < key->vs_prolog.num_inputs; i++) { - bool divisor_is_one = - key->vs_prolog.states.instance_divisor_is_one & (1u << i); - bool divisor_is_fetched = - key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); - LLVMValueRef index = NULL; - - if (divisor_is_one) { - index = ctx->abi.instance_id; - } else if (divisor_is_fetched) { - LLVMValueRef udiv_factors[4]; - - for (unsigned j = 0; j < 4; j++) { - udiv_factors[j] = - si_buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0)); - udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); - } - /* The faster NUW version doesn't work when InstanceID == UINT_MAX. - * Such InstanceID might not be achievable in a reasonable time though. - */ - index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, - udiv_factors[0], udiv_factors[1], - udiv_factors[2], udiv_factors[3]); - } - - if (divisor_is_one || divisor_is_fetched) { - /* Add StartInstance. 
*/ - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMGetParam(ctx->main_fn, user_sgpr_base + - SI_SGPR_START_INSTANCE), ""); - } else { - /* VertexID + BaseVertex */ - index = LLVMBuildAdd(ctx->ac.builder, - ctx->abi.vertex_id, - LLVMGetParam(func, user_sgpr_base + - SI_SGPR_BASE_VERTEX), ""); - } - - index = ac_to_float(&ctx->ac, index); - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, - ctx->args.arg_count + i, ""); - } - - si_llvm_build_ret(ctx, ret); -} - static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, @@ -3618,7 +2539,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen, shader->prolog = si_get_shader_part(sscreen, &sscreen->vs_prologs, PIPE_SHADER_VERTEX, true, &prolog_key, compiler, - debug, si_build_vs_prolog_function, + debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog"); return shader->prolog != NULL; } diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 91b581294d2..e0f71b4635e 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -231,28 +231,16 @@ LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMValueRef val2); void si_llvm_emit_barrier(struct si_shader_context *ctx); void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset); void si_declare_compute_memory(struct si_shader_context *ctx); LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle); -void si_llvm_export_vs(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput); -void si_emit_streamout_output(struct si_shader_context *ctx, - LLVMValueRef const *so_buffers, - LLVMValueRef const *so_write_offsets, - struct pipe_stream_output *stream_out, - struct si_shader_output_values 
*shader_out); void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, enum ac_arg_type type, struct ac_arg *arg, unsigned idx); - -void si_llvm_load_input_vs( - struct si_shader_context *ctx, - unsigned input_index, - LLVMValueRef out[4]); - bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir); LLVMValueRef si_unpack_param(struct si_shader_context *ctx, @@ -283,9 +271,6 @@ int si_compile_llvm(struct si_screen *sscreen, const char *name, bool less_optimized); void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader); -void si_llvm_emit_streamout(struct si_shader_context *ctx, - struct si_shader_output_values *outputs, - unsigned noutput, unsigned stream); void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader); bool gfx10_ngg_export_prim_early(struct si_shader *shader); @@ -324,7 +309,7 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx); -void si_llvm_init_tes_callbacks(struct si_shader_context *ctx); +void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); /* si_shader_llvm_ps.c */ void si_llvm_build_ps_prolog(struct si_shader_context *ctx, @@ -338,4 +323,23 @@ void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_resources.c */ void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); +/* si_shader_llvm_vs.c */ +void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); +void si_llvm_streamout_store_output(struct si_shader_context *ctx, + LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out); +void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct 
si_shader_output_values *outputs, + unsigned noutput, unsigned stream); +void si_llvm_build_vs_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs); +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key); +void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); + #endif diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_build.c b/src/gallium/drivers/radeonsi/si_shader_llvm_build.c index ec7629514a7..829b9a2fb33 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_build.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_build.c @@ -129,3 +129,15 @@ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage); LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } + +void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset) +{ + LLVMValueRef args[] = { + ac_get_arg(&ctx->ac, param), + LLVMConstInt(ctx->ac.i32, bitoffset, 0), + }; + ac_build_intrinsic(&ctx->ac, + "llvm.amdgcn.init.exec.from.input", + ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT); +} diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index f88bde7a019..de3a5cb95a2 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -609,7 +609,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, } if (stream == 0) - si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs); + si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs); LLVMBuildBr(builder, end_bb); } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index b83e26fc582..a9f6e76f1f0 100644 --- 
a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -1277,10 +1277,19 @@ void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx) ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; } -void si_llvm_init_tes_callbacks(struct si_shader_context *ctx) +void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) { ctx->abi.load_tess_varyings = si_nir_load_input_tes; ctx->abi.load_tess_coord = si_load_tess_coord; ctx->abi.load_tess_level = si_load_tess_level; ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; + + if (ctx->shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (ctx->shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c new file mode 100644 index 00000000000..4a56bdf81cf --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -0,0 +1,1130 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_shader_internal.h" +#include "si_pipe.h" +#include "sid.h" +#include "util/u_memory.h" + +static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, + LLVMValueRef i32, unsigned index) +{ + assert(index <= 1); + + if (index == 1) + return LLVMBuildAShr(ctx->ac.builder, i32, + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + + return LLVMBuildSExt(ctx->ac.builder, + LLVMBuildTrunc(ctx->ac.builder, i32, + ctx->ac.i16, ""), + ctx->ac.i32, ""); +} + +static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, + LLVMValueRef out[4]) +{ + const struct si_shader_info *info = &ctx->shader->selector->info; + unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]; + + if (vs_blit_property) { + LLVMValueRef vertex_id = ctx->abi.vertex_id; + LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder, + LLVMIntULE, vertex_id, + ctx->ac.i32_1, ""); + /* Use LLVMIntNE, because we have 3 vertices and only + * the middle one should use y2. 
+ */ + LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, + LLVMIntNE, vertex_id, + ctx->ac.i32_1, ""); + + unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index; + if (input_index == 0) { + /* Position: */ + LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs); + LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 1); + + LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0); + LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1); + LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0); + LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1); + + LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, + x1, x2, ""); + LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, + y1, y2, ""); + + out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, ""); + out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, ""); + out[2] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 2); + out[3] = ctx->ac.f32_1; + return; + } + + /* Color or texture coordinates: */ + assert(input_index == 1); + + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + for (int i = 0; i < 4; i++) { + out[i] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 3 + i); + } + } else { + assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); + LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 3); + LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 4); + LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 5); + LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 6); + + out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, + x1, x2, ""); + out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, + y1, y2, ""); + out[2] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 7); + out[3] = LLVMGetParam(ctx->main_fn, + param_vs_blit_inputs + 8); + } + return; + } + + unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; + union si_vs_fix_fetch fix_fetch; + LLVMValueRef vb_desc; 
+ LLVMValueRef vertex_index; + LLVMValueRef tmp; + + if (input_index < num_vbos_in_user_sgprs) { + vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]); + } else { + unsigned index= input_index - num_vbos_in_user_sgprs; + vb_desc = ac_build_load_to_sgpr(&ctx->ac, + ac_get_arg(&ctx->ac, ctx->vertex_buffers), + LLVMConstInt(ctx->ac.i32, index, 0)); + } + + vertex_index = LLVMGetParam(ctx->main_fn, + ctx->vertex_index0.arg_index + + input_index); + + /* Use the open-coded implementation for all loads of doubles and + * of dword-sized data that needs fixups. We need to insert conversion + * code anyway, and the amd/common code does it for us. + * + * Note: On LLVM <= 8, we can only open-code formats with + * channel size >= 4 bytes. + */ + bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + if (opencode || + (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || + (fix_fetch.u.log_size == 2)) { + tmp = ac_build_opencoded_load_format( + &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, + fix_fetch.u.format, fix_fetch.u.reverse, !opencode, + vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true); + for (unsigned i = 0; i < 4; ++i) + out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); + return; + } + + /* Do multiple loads for special formats. 
*/ + unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + LLVMValueRef fetches[4]; + unsigned num_fetches; + unsigned fetch_stride; + unsigned channels_per_fetch; + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) { + num_fetches = MIN2(required_channels, 3); + fetch_stride = 1 << fix_fetch.u.log_size; + channels_per_fetch = 1; + } else { + num_fetches = 1; + fetch_stride = 0; + channels_per_fetch = required_channels; + } + + for (unsigned i = 0; i < num_fetches; ++i) { + LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); + fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, + channels_per_fetch, 0, true); + } + + if (num_fetches == 1 && channels_per_fetch > 1) { + LLVMValueRef fetch = fetches[0]; + for (unsigned i = 0; i < channels_per_fetch; ++i) { + tmp = LLVMConstInt(ctx->ac.i32, i, false); + fetches[i] = LLVMBuildExtractElement( + ctx->ac.builder, fetch, tmp, ""); + } + num_fetches = channels_per_fetch; + channels_per_fetch = 1; + } + + for (unsigned i = num_fetches; i < 4; ++i) + fetches[i] = LLVMGetUndef(ctx->ac.f32); + + if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && + required_channels == 4) { + if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT) + fetches[3] = ctx->ac.i32_1; + else + fetches[3] = ctx->ac.f32_1; + } else if (fix_fetch.u.log_size == 3 && + (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM || + fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED || + fix_fetch.u.format == AC_FETCH_FORMAT_SINT) && + required_channels == 4) { + /* For 2_10_10_10, the hardware returns an unsigned value; + * convert it to a signed one. + */ + LLVMValueRef tmp = fetches[3]; + LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); + + /* First, recover the sign-extended signed integer value. 
*/ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) + tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, ""); + else + tmp = ac_to_integer(&ctx->ac, tmp); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. + */ + tmp = LLVMBuildShl(ctx->ac.builder, tmp, + fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? + LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); + tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, ""); + + /* Convert back to the right type. */ + if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) { + LLVMValueRef clamp; + LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, ""); + tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, ""); + } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) { + tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, ""); + } + + fetches[3] = tmp; + } + + for (unsigned i = 0; i < 4; ++i) + out[i] = ac_to_float(&ctx->ac, fetches[i]); +} + +static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index) +{ + LLVMValueRef input[4]; + + load_input_vs(ctx, input_index / 4, input); + + for (unsigned chan = 0; chan < 4; chan++) { + ctx->inputs[input_index + chan] = + LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, ""); + } +} + +void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir) +{ + uint64_t processed_inputs = 0; + + nir_foreach_variable(variable, &nir->inputs) { + unsigned attrib_count = glsl_count_attribute_slots(variable->type, + true); + unsigned input_idx = variable->data.driver_location; + unsigned loc = variable->data.location; + + for (unsigned i = 0; i < attrib_count; i++) { + /* Packed components share the same location so skip + * them if we have already processed 
the location. + */ + if (processed_inputs & ((uint64_t)1 << (loc + i))) { + input_idx += 4; + continue; + } + + declare_input_vs(ctx, input_idx); + if (glsl_type_is_dual_slot(variable->type)) { + input_idx += 4; + declare_input_vs(ctx, input_idx); + } + + processed_inputs |= ((uint64_t)1 << (loc + i)); + input_idx += 4; + } + } +} + +void si_llvm_streamout_store_output(struct si_shader_context *ctx, + LLVMValueRef const *so_buffers, + LLVMValueRef const *so_write_offsets, + struct pipe_stream_output *stream_out, + struct si_shader_output_values *shader_out) +{ + unsigned buf_idx = stream_out->output_buffer; + unsigned start = stream_out->start_component; + unsigned num_comps = stream_out->num_components; + LLVMValueRef out[4]; + + assert(num_comps && num_comps <= 4); + if (!num_comps || num_comps > 4) + return; + + /* Load the output as int. */ + for (int j = 0; j < num_comps; j++) { + assert(stream_out->stream == shader_out->vertex_stream[start + j]); + + out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]); + } + + /* Pack the output. */ + LLVMValueRef vdata = NULL; + + switch (num_comps) { + case 1: /* as i32 */ + vdata = out[0]; + break; + case 2: /* as v2i32 */ + case 3: /* as v3i32 */ + if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) { + vdata = ac_build_gather_values(&ctx->ac, out, num_comps); + break; + } + /* as v4i32 (aligned to 4) */ + out[3] = LLVMGetUndef(ctx->ac.i32); + /* fall through */ + case 4: /* as v4i32 */ + vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps)); + break; + } + + ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], + vdata, num_comps, + so_write_offsets[buf_idx], + ctx->ac.i32_0, + stream_out->dst_offset * 4, ac_glc | ac_slc); +} + +/** + * Write streamout data to buffers for vertex stream @p stream (different + * vertex streams can occur for GS copy shaders). 
+ */ +void si_llvm_emit_streamout(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput, unsigned stream) +{ + struct si_shader_selector *sel = ctx->shader->selector; + struct pipe_stream_output_info *so = &sel->so; + LLVMBuilderRef builder = ctx->ac.builder; + int i; + + /* Get bits [22:16], i.e. (so_param >> 16) & 127; */ + LLVMValueRef so_vtx_count = + si_unpack_param(ctx, ctx->streamout_config, 16, 7); + + LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + + /* can_emit = tid < so_vtx_count; */ + LLVMValueRef can_emit = + LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, ""); + + /* Emit the streamout code conditionally. This actually avoids + * out-of-bounds buffer access. The hw tells us via the SGPR + * (so_vtx_count) which threads are allowed to emit streamout data. */ + ac_build_ifcc(&ctx->ac, can_emit, 6501); + { + /* The buffer offset is computed as follows: + * ByteOffset = streamout_offset[buffer_id]*4 + + * (streamout_write_index + thread_id)*stride[buffer_id] + + * attrib_offset + */ + + LLVMValueRef so_write_index = + ac_get_arg(&ctx->ac, + ctx->streamout_write_index); + + /* Compute (streamout_write_index + thread_id). */ + so_write_index = LLVMBuildAdd(builder, so_write_index, tid, ""); + + /* Load the descriptor and compute the write offset for each + * enabled buffer. 
*/ + LLVMValueRef so_write_offset[4] = {}; + LLVMValueRef so_buffers[4]; + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, + ctx->rw_buffers); + + for (i = 0; i < 4; i++) { + if (!so->stride[i]) + continue; + + LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, + SI_VS_STREAMOUT_BUF0 + i, 0); + + so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + + LLVMValueRef so_offset = ac_get_arg(&ctx->ac, + ctx->streamout_offset[i]); + so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + + so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index, + LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0), + so_offset); + } + + /* Write streamout data. */ + for (i = 0; i < so->num_outputs; i++) { + unsigned reg = so->output[i].register_index; + + if (reg >= noutput) + continue; + + if (stream != so->output[i].stream) + continue; + + si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, + &so->output[i], &outputs[reg]); + } + } + ac_build_endif(&ctx->ac, 6501); +} + +static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, + struct ac_export_args *pos, LLVMValueRef *out_elts) +{ + unsigned reg_index; + unsigned chan; + unsigned const_chan; + LLVMValueRef base_elt; + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers); + LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, + SI_VS_CONST_CLIP_PLANES, 0); + LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); + + for (reg_index = 0; reg_index < 2; reg_index ++) { + struct ac_export_args *args = &pos[2 + reg_index]; + + args->out[0] = + args->out[1] = + args->out[2] = + args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f); + + /* Compute dot products of position and user clip plane vectors */ + for (chan = 0; chan < 4; chan++) { + for (const_chan = 0; const_chan < 4; const_chan++) { + LLVMValueRef addr = + LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + + const_chan) * 4, 0); + base_elt = si_buffer_load_const(ctx, const_resource, + 
addr); + args->out[chan] = ac_build_fmad(&ctx->ac, base_elt, + out_elts[const_chan], args->out[chan]); + } + } + + args->enabled_channels = 0xf; + args->valid_mask = 0; + args->done = 0; + args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index; + args->compr = 0; + } +} + +/* Initialize arguments for the shader export intrinsic */ +static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, + LLVMValueRef *values, + unsigned target, + struct ac_export_args *args) +{ + args->enabled_channels = 0xf; /* writemask - default is 0xf */ + args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */ + args->done = 0; /* Specify whether this is the last export */ + args->target = target; /* Specify the target we are exporting */ + args->compr = false; + + memcpy(&args->out[0], values, sizeof(values[0]) * 4); +} + +static void si_export_param(struct si_shader_context *ctx, unsigned index, + LLVMValueRef *values) +{ + struct ac_export_args args; + + si_llvm_init_vs_export_args(ctx, values, + V_008DFC_SQ_EXP_PARAM + index, &args); + ac_build_export(&ctx->ac, &args); +} + +static void si_build_param_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + struct si_shader *shader = ctx->shader; + unsigned param_count = 0; + + for (unsigned i = 0; i < noutput; i++) { + unsigned semantic_name = outputs[i].semantic_name; + unsigned semantic_index = outputs[i].semantic_index; + + if (outputs[i].vertex_stream[0] != 0 && + outputs[i].vertex_stream[1] != 0 && + outputs[i].vertex_stream[2] != 0 && + outputs[i].vertex_stream[3] != 0) + continue; + + switch (semantic_name) { + case TGSI_SEMANTIC_LAYER: + case TGSI_SEMANTIC_VIEWPORT_INDEX: + case TGSI_SEMANTIC_CLIPDIST: + case TGSI_SEMANTIC_COLOR: + case TGSI_SEMANTIC_BCOLOR: + case TGSI_SEMANTIC_PRIMID: + case TGSI_SEMANTIC_FOG: + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_GENERIC: + break; + default: + continue; + } + + if ((semantic_name != 
TGSI_SEMANTIC_GENERIC || + semantic_index < SI_MAX_IO_GENERIC) && + shader->key.opt.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, + semantic_index, true))) + continue; + + si_export_param(ctx, param_count, outputs[i].values); + + assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); + shader->info.vs_output_param_offset[i] = param_count++; + } + + shader->info.nr_param_exports = param_count; +} + +/** + * Vertex color clamping. + * + * This uses a state constant loaded in a user data SGPR and + * an IF statement is added that clamps all colors if the constant + * is true. + */ +static void si_vertex_color_clamping(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4]; + bool has_colors = false; + + /* Store original colors to alloca variables. */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, ""); + LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]); + } + has_colors = true; + } + + if (!has_colors) + return; + + /* The state is in the first bit of the user SGPR. */ + LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits); + cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, ""); + + ac_build_ifcc(&ctx->ac, cond, 6502); + + /* Store clamped colors to alloca variables within the conditional block. 
*/ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + LLVMBuildStore(ctx->ac.builder, + ac_build_clamp(&ctx->ac, outputs[i].values[j]), + addr[i][j]); + } + } + ac_build_endif(&ctx->ac, 6502); + + /* Load the colors back from the alloca variables: they hold the clamped values if the conditional block ran and the original values otherwise. */ + for (unsigned i = 0; i < noutput; i++) { + if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR && + outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR) + continue; + + for (unsigned j = 0; j < 4; j++) { + outputs[i].values[j] = + LLVMBuildLoad(ctx->ac.builder, addr[i][j], ""); + } + } +} + +/* Generate export instructions for hardware VS shader stage or NGG GS stage + * (position and parameter data only). + * + * outputs/noutput describe the shader outputs. Color outputs may be rewritten by si_vertex_color_clamping, and shader->info.nr_pos_exports is incremented once per position export that is emitted. + */ +void si_llvm_build_vs_exports(struct si_shader_context *ctx, + struct si_shader_output_values *outputs, + unsigned noutput) +{ + struct si_shader *shader = ctx->shader; + struct ac_export_args pos_args[4] = {}; /* [0] position, [1] misc vector, [2] and up: clip distances; a slot counts as used when its out[0] is non-NULL */ + LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL; + unsigned pos_idx; + int i; + + si_vertex_color_clamping(ctx, outputs, noutput); + + /* Build position exports.
*/ + for (i = 0; i < noutput; i++) { + switch (outputs[i].semantic_name) { + case TGSI_SEMANTIC_POSITION: + si_llvm_init_vs_export_args(ctx, outputs[i].values, + V_008DFC_SQ_EXP_POS, &pos_args[0]); + break; + case TGSI_SEMANTIC_PSIZE: + psize_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_LAYER: + layer_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_VIEWPORT_INDEX: + viewport_index_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_EDGEFLAG: + edgeflag_value = outputs[i].values[0]; + break; + case TGSI_SEMANTIC_CLIPDIST: + if (!shader->key.opt.clip_disable) { + /* Clip distances use position slots 2 + semantic_index. */ unsigned index = 2 + outputs[i].semantic_index; + si_llvm_init_vs_export_args(ctx, outputs[i].values, + V_008DFC_SQ_EXP_POS + index, + &pos_args[index]); + } + break; + case TGSI_SEMANTIC_CLIPVERTEX: + if (!shader->key.opt.clip_disable) { + si_llvm_emit_clipvertex(ctx, pos_args, + outputs[i].values); + } + break; + } + } + + /* We need to add the position output manually if it's missing. The default position is (0, 0, 0, 1). */ + if (!pos_args[0].out[0]) { + pos_args[0].enabled_channels = 0xf; /* writemask */ + pos_args[0].valid_mask = 0; /* EXEC mask */ + pos_args[0].done = 0; /* last export? */ + pos_args[0].target = V_008DFC_SQ_EXP_POS; + pos_args[0].compr = 0; /* COMPR flag */ + pos_args[0].out[0] = ctx->ac.f32_0; /* X */ + pos_args[0].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[0].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[0].out[3] = ctx->ac.f32_1; /* W */ + } + + /* The edge flag is written to the misc vector only when the shader is not compiled as NGG. */ bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && + !shader->key.as_ngg; + + /* Write the misc vector (point size, edgeflag, layer, viewport).
*/ + if (shader->selector->info.writes_psize || + pos_writes_edgeflag || + shader->selector->info.writes_viewport_index || + shader->selector->info.writes_layer) { + /* Channel mask: bit 0 = point size (X), bit 1 = edge flag (Y), bit 2 = layer (Z). The viewport index bit is OR'ed in further down. */ pos_args[1].enabled_channels = shader->selector->info.writes_psize | + (pos_writes_edgeflag << 1) | + (shader->selector->info.writes_layer << 2); + + pos_args[1].valid_mask = 0; /* EXEC mask */ + pos_args[1].done = 0; /* last export? */ + pos_args[1].target = V_008DFC_SQ_EXP_POS + 1; + pos_args[1].compr = 0; /* COMPR flag */ + pos_args[1].out[0] = ctx->ac.f32_0; /* X */ + pos_args[1].out[1] = ctx->ac.f32_0; /* Y */ + pos_args[1].out[2] = ctx->ac.f32_0; /* Z */ + pos_args[1].out[3] = ctx->ac.f32_0; /* W */ + + if (shader->selector->info.writes_psize) + pos_args[1].out[0] = psize_value; + + if (pos_writes_edgeflag) { + /* The output is a float, but the hw expects an integer + * with the first bit containing the edge flag. */ + edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, + edgeflag_value, + ctx->ac.i32, ""); + /* Limit the converted value to at most 1. */ edgeflag_value = ac_build_umin(&ctx->ac, + edgeflag_value, + ctx->ac.i32_1); + + /* The LLVM intrinsic expects a float. */ + pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value); + } + + if (ctx->screen->info.chip_class >= GFX9) { + /* GFX9 has the layer in out.z[10:0] and the viewport + * index in out.z[19:16].
+ */ + if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + LLVMValueRef v = viewport_index_value; + + /* Combine (viewport index << 16) with whatever Z already holds (the layer, or 0 if no layer is written). */ v = ac_to_integer(&ctx->ac, v); + v = LLVMBuildShl(ctx->ac.builder, v, + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + v = LLVMBuildOr(ctx->ac.builder, v, + ac_to_integer(&ctx->ac, pos_args[1].out[2]), ""); + pos_args[1].out[2] = ac_to_float(&ctx->ac, v); + pos_args[1].enabled_channels |= 1 << 2; + } + } else { + /* Chips older than GFX9: the layer goes into Z and the viewport index into W. */ if (shader->selector->info.writes_layer) + pos_args[1].out[2] = layer_value; + + if (shader->selector->info.writes_viewport_index) { + pos_args[1].out[3] = viewport_index_value; + pos_args[1].enabled_channels |= 1 << 3; + } + } + } + + /* Count the position slots in use. NOTE(review): nr_pos_exports is incremented, not reset, here; presumably it is zero on entry - confirm. */ for (i = 0; i < 4; i++) + if (pos_args[i].out[0]) + shader->info.nr_pos_exports++; + + /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + * Setting valid_mask=1 prevents it and has no other effect. + */ + if (ctx->screen->info.family == CHIP_NAVI10 || + ctx->screen->info.family == CHIP_NAVI12 || + ctx->screen->info.family == CHIP_NAVI14) + pos_args[0].valid_mask = 1; + + /* Emit the used slots with consecutive targets starting at V_008DFC_SQ_EXP_POS, skipping unused ones. */ pos_idx = 0; + for (i = 0; i < 4; i++) { + if (!pos_args[i].out[0]) + continue; + + /* Specify the target we are exporting */ + pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++; + + if (pos_idx == shader->info.nr_pos_exports) + /* Specify that this is the last export */ + pos_args[i].done = 1; + + ac_build_export(&ctx->ac, &pos_args[i]); + } + + /* Build parameter exports.
*/ + si_build_param_exports(ctx, outputs, noutput); +} + /* VS output epilogue: load every shader output from addrs (four values per output), emit streamout when it is not handled by NGG and the shader has streamout outputs, optionally append a PrimitiveID output, then build the position and parameter exports. */ +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader_output_values *outputs = NULL; + int i,j; + + assert(!ctx->shader->is_gs_copy_shader); + assert(info->num_outputs <= max_outputs); + + /* The extra element leaves room for the PrimitiveID output appended further down. NOTE(review): the allocation result is used without a NULL check. */ outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); + + for (i = 0; i < info->num_outputs; i++) { + outputs[i].semantic_name = info->output_semantic_name[i]; + outputs[i].semantic_index = info->output_semantic_index[i]; + + for (j = 0; j < 4; j++) { + outputs[i].values[j] = + LLVMBuildLoad(ctx->ac.builder, + addrs[4 * i + j], + ""); + /* output_streams packs a 2-bit stream index per component. */ outputs[i].vertex_stream[j] = + (info->output_streams[i] >> (2 * j)) & 3; + } + } + + /* At this point i equals info->num_outputs. */ if (!ctx->screen->use_ngg_streamout && + ctx->shader->selector->so.num_outputs) + si_llvm_emit_streamout(ctx, outputs, i, 0); + + /* Export PrimitiveID.
*/ + if (ctx->shader->key.mono.u.vs_export_prim_id) { + outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID; + outputs[i].semantic_index = 0; + /* The primitive ID goes into the first component; the other three are 0. */ outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); + for (j = 1; j < 4; j++) + outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0); + + /* All components belong to vertex stream 0. */ memset(outputs[i].vertex_stream, 0, + sizeof(outputs[i].vertex_stream)); + i++; + } + + si_llvm_build_vs_exports(ctx, outputs, i); + FREE(outputs); +} + /* Output epilogue that only returns the position: loads the four components of the first output with the POSITION semantic from addrs and inserts them into elements 0-3 of ctx->return_value. Asserts that such an output exists. */ +static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct si_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef pos[4] = {}; + + assert(info->num_outputs <= max_outputs); + + /* Find the position output; addrs holds four consecutive entries per output. */ for (unsigned i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) + continue; + + for (unsigned chan = 0; chan < 4; chan++) + pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + break; + } + assert(pos[0] != NULL); + + /* Return the position output. */ + LLVMValueRef ret = ctx->return_value; + for (unsigned chan = 0; chan < 4; chan++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); + ctx->return_value = ret; +} + +/** + * Build the vertex shader prolog function. + * + * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). + * All inputs are returned unmodified. The vertex load indices are + * stored after them, which will be used by the API VS for fetching inputs.
+ * + * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: + * input_v0, + * input_v1, + * input_v2, + * input_v3, + * (VertexID + BaseVertex), + * (InstanceID + StartInstance), + * (InstanceID / 2 + StartInstance) + * + * The register counts, instance divisor state and fast-launch flags are all read from key->vs_prolog. + */ +void si_llvm_build_vs_prolog(struct si_shader_context *ctx, + union si_shader_part_key *key) +{ + LLVMTypeRef *returns; + LLVMValueRef ret, func; + int num_returns, i; + /* The VS system-value VGPRs come after the VGPRs of the merged next stage. */ unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs; + unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4; + struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs]; + /* NOTE(review): fixed capacity of 9; presumably num_input_vgprs never exceeds it - confirm against the callers. */ struct ac_arg input_vgpr_param[9]; + LLVMValueRef input_vgprs[9]; + unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + + num_input_vgprs; + /* User SGPRs start at index 8 when there are merged next-stage VGPRs, otherwise at 0. */ unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; + + memset(&ctx->args, 0, sizeof(ctx->args)); + + /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ + returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * + sizeof(LLVMTypeRef)); + num_returns = 0; + + /* Declare input and output SGPRs. */ + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, + &input_sgpr_param[i]); + returns[num_returns++] = ctx->ac.i32; + } + + /* Input SGPR 3 is used as merged_wave_info. */ struct ac_arg merged_wave_info = input_sgpr_param[3]; + + /* Preloaded VGPRs (outputs must be floats) */ + for (i = 0; i < num_input_vgprs; i++) { + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); + returns[num_returns++] = ctx->ac.f32; + } + + /* Vertex load indices. */ + for (i = 0; i < key->vs_prolog.num_inputs; i++) + returns[num_returns++] = ctx->ac.f32; + + /* Create the function.
*/ + si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0); + func = ctx->main_fn; + + /* Fetch the incoming VGPR values so they can be adjusted before being returned. */ for (i = 0; i < num_input_vgprs; i++) { + input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]); + } + + if (key->vs_prolog.num_merged_next_stage_vgprs) { + if (!key->vs_prolog.is_monolithic) + si_init_exec_from_input(ctx, merged_wave_info, 0); + + if (key->vs_prolog.as_ls && + ctx->screen->info.has_ls_vgpr_init_bug) { + /* If there are no HS threads, SPI loads the LS VGPRs + * starting at VGPR 0. Shift them back to where they + * belong. + */ + LLVMValueRef has_hs_threads = + LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, + si_unpack_param(ctx, input_sgpr_param[3], 8, 8), + ctx->ac.i32_0, ""); + + /* Walk from the highest VGPR down so each source (two registers lower) is read before it is overwritten. */ for (i = 4; i > 0; --i) { + input_vgprs[i + 1] = + LLVMBuildSelect(ctx->ac.builder, has_hs_threads, + input_vgprs[i + 1], + input_vgprs[i - 1], ""); + } + } + } + + if (key->vs_prolog.gs_fast_launch_tri_list || + key->vs_prolog.gs_fast_launch_tri_strip) { + LLVMValueRef wave_id, thread_id_in_tg; + + /* thread_id_in_tg = wave_id * wave_size + thread id within the wave; wave_id is the 4-bit field at bit 24 of input SGPR 3. */ wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); + thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id, + LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), + ac_get_thread_id(&ctx->ac)); + + /* The GS fast launch initializes all VGPRs to the value of + * the first thread, so we have to add the thread ID. + * + * Only these are initialized by the hw: + * VGPR2: Base Primitive ID + * VGPR5: Base Vertex ID + * VGPR6: Instance ID + */ + + /* Put the vertex thread IDs into VGPRs as-is instead of packing them. + * The NGG cull shader will read them from there.
+ */ + if (key->vs_prolog.gs_fast_launch_tri_list) { + /* Triangle list: thread t uses vertices 3t, 3t + 1 and 3t + 2. */ input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ + LLVMConstInt(ctx->ac.i32, 0, 0)); + input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ + LLVMConstInt(ctx->ac.i32, 1, 0)); + input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ + LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ + LLVMConstInt(ctx->ac.i32, 2, 0)); + } else { + assert(key->vs_prolog.gs_fast_launch_tri_strip); + LLVMBuilderRef builder = ctx->ac.builder; + /* Triangle indices: thread t starts from vertices t, t + 1 and t + 2. */ + LLVMValueRef index[3] = { + thread_id_in_tg, + LLVMBuildAdd(builder, thread_id_in_tg, + LLVMConstInt(ctx->ac.i32, 1, 0), ""), + LLVMBuildAdd(builder, thread_id_in_tg, + LLVMConstInt(ctx->ac.i32, 2, 0), ""), + }; + /* is_odd is the lowest bit of the thread ID. */ LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, + thread_id_in_tg, ctx->ac.i1, ""); + /* flatshade_first is true when the 2-bit field at bit 4 of vs_state_bits is 0. */ LLVMValueRef flatshade_first = + LLVMBuildICmp(builder, LLVMIntEQ, + si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), + ctx->ac.i32_0, ""); + + ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, + flatshade_first, index); + input_vgprs[0] = index[0]; + input_vgprs[1] = index[1]; + input_vgprs[4] = index[2]; + } + + /* Triangles always have all edge flags set initially. */ + input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); + + input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], + thread_id_in_tg, ""); /* PrimID */ + input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], + thread_id_in_tg, ""); /* VertexID */ + input_vgprs[8] = input_vgprs[6]; /* InstanceID */ + } + + /* VertexID is the first VS VGPR. InstanceID is the fourth one on GFX10 and later; on older chips it is the third when compiled as LS and the second otherwise. */ unsigned vertex_id_vgpr = first_vs_vgpr; + unsigned instance_id_vgpr = + ctx->screen->info.chip_class >= GFX10 ? + first_vs_vgpr + 3 : + first_vs_vgpr + (key->vs_prolog.as_ls ?
2 : 1); + + ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; + ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; + + /* InstanceID = VertexID >> 16; + * VertexID = VertexID & 0xffff; + */ + if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { + ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 16, 0), ""); + ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, + LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); + } + + /* Copy inputs to outputs. This should be no-op, as the registers match, + * but it will prevent the compiler from overwriting them unintentionally. + */ + ret = ctx->return_value; + for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) { + LLVMValueRef p = LLVMGetParam(func, i); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, ""); + } + for (i = 0; i < num_input_vgprs; i++) { + LLVMValueRef p = input_vgprs[i]; + + /* Return the (possibly unpacked) IDs computed above instead of the raw VGPR values. */ if (i == vertex_id_vgpr) + p = ctx->abi.vertex_id; + else if (i == instance_id_vgpr) + p = ctx->abi.instance_id; + + /* VGPR return slots are declared as floats, so convert before inserting. */ p = ac_to_float(&ctx->ac, p); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, + key->vs_prolog.num_input_sgprs + i, ""); + } + + /* Compute vertex load indices from instance divisors.
*/ + LLVMValueRef instance_divisor_constbuf = NULL; + + /* The divisor constant buffer is loaded only when at least one input has a fetched divisor. */ if (key->vs_prolog.states.instance_divisor_is_fetched) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + LLVMValueRef buf_index = + LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); + instance_divisor_constbuf = + ac_build_load_to_sgpr(&ctx->ac, list, buf_index); + } + + for (i = 0; i < key->vs_prolog.num_inputs; i++) { + /* instance_divisor_is_one and instance_divisor_is_fetched are bitmasks with one bit per input. */ bool divisor_is_one = + key->vs_prolog.states.instance_divisor_is_one & (1u << i); + bool divisor_is_fetched = + key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); + LLVMValueRef index = NULL; + + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; + + /* Each input owns four consecutive dwords (16 bytes) of division factors in the constant buffer. */ for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = + si_buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); + } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, + udiv_factors[0], udiv_factors[1], + udiv_factors[2], udiv_factors[3]); + } + + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance.
*/ + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + + SI_SGPR_START_INSTANCE), ""); + } else { + /* VertexID + BaseVertex */ + index = LLVMBuildAdd(ctx->ac.builder, + ctx->abi.vertex_id, + LLVMGetParam(func, user_sgpr_base + + SI_SGPR_BASE_VERTEX), ""); + } + + /* The load indices are returned as floats after all input registers. */ index = ac_to_float(&ctx->ac, index); + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, + ctx->args.arg_count + i, ""); + } + + si_llvm_build_ret(ctx, ret); +} + /* ABI callback for loading the base vertex: yields the base_vertex argument when the draw is indexed and 0 otherwise. */ +static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + /* For non-indexed draws, the base vertex set by the driver + * (for direct draws) or the CP (for indirect draws) is the + * first vertex ID, but GLSL expects 0 to be returned. + */ + LLVMValueRef vs_state = ac_get_arg(&ctx->ac, + ctx->vs_state_bits); + LLVMValueRef indexed; + + /* Bit 1 of vs_state_bits is the "indexed" flag. */ indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, ""); + indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, ""); + + return LLVMBuildSelect(ctx->ac.builder, indexed, + ac_get_arg(&ctx->ac, ctx->args.base_vertex), + ctx->ac.i32_0, ""); +} + /* Install the VS callbacks in ctx->abi: choose the emit_outputs epilogue from the shader key and ngg_cull_shader (the first matching condition wins) and set load_base_vertex. */ +void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) +{ + struct si_shader *shader = ctx->shader; + + if (shader->key.as_ls) + ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; + else if (shader->key.as_es) + ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (shader->key.opt.vs_as_prim_discard_cs) + ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; + else if (ngg_cull_shader) + ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32; + else if (shader->key.as_ngg) + ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; + else + ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; + + ctx->abi.load_base_vertex = get_base_vertex; +} diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 243feba7417..6b5dd038407
100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -997,59 +997,10 @@ void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize) si_lower_nir(sscreen, nir); } -static void declare_nir_input_vs(struct si_shader_context *ctx, - struct nir_variable *variable, - unsigned input_index, - LLVMValueRef out[4]) -{ - si_llvm_load_input_vs(ctx, input_index, out); -} - -static void bitcast_inputs(struct si_shader_context *ctx, - LLVMValueRef data[4], - unsigned input_idx) -{ - for (unsigned chan = 0; chan < 4; chan++) { - ctx->inputs[input_idx + chan] = - LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, ""); - } -} - bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - struct si_shader_info *info = &ctx->shader->selector->info; - if (nir->info.stage == MESA_SHADER_VERTEX) { - uint64_t processed_inputs = 0; - nir_foreach_variable(variable, &nir->inputs) { - unsigned attrib_count = glsl_count_attribute_slots(variable->type, - true); - unsigned input_idx = variable->data.driver_location; - - LLVMValueRef data[4]; - unsigned loc = variable->data.location; - - for (unsigned i = 0; i < attrib_count; i++) { - /* Packed components share the same location so skip - * them if we have already processed the location. - */ - if (processed_inputs & ((uint64_t)1 << (loc + i))) { - input_idx += 4; - continue; - } - - declare_nir_input_vs(ctx, variable, input_idx / 4, data); - bitcast_inputs(ctx, data, input_idx); - if (glsl_type_is_dual_slot(variable->type)) { - input_idx += 4; - declare_nir_input_vs(ctx, variable, input_idx / 4, data); - bitcast_inputs(ctx, data, input_idx); - } - - processed_inputs |= ((uint64_t)1 << (loc + i)); - input_idx += 4; - } - } + si_llvm_load_vs_inputs(ctx, nir); } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { unsigned colors_read = ctx->shader->selector->info.colors_read; -- 2.30.2