si_shader_llvm_ps.c \
si_shader_llvm_resources.c \
si_shader_llvm_tess.c \
+ si_shader_llvm_vs.c \
si_shader_nir.c \
si_shaderlib_tgsi.c \
si_state.c \
(info->output_streams[reg] >> (2 * comp)) & 3;
}
- si_emit_streamout_output(ctx, so_buffer, offset, &so->output[i], &out);
+ si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
}
}
i++;
}
- si_llvm_export_vs(ctx, outputs, i);
+ si_llvm_build_vs_exports(ctx, outputs, i);
}
ac_build_endif(&ctx->ac, 6002);
}
}
}
- si_llvm_export_vs(ctx, outputs, info->num_outputs);
+ si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
}
ac_build_endif(&ctx->ac, 5145);
}
'si_shader_llvm_ps.c',
'si_shader_llvm_resources.c',
'si_shader_llvm_tess.c',
+ 'si_shader_llvm_vs.c',
'si_shader_nir.c',
'si_shaderlib_tgsi.c',
'si_state.c',
static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
-static void si_build_vs_prolog_function(struct si_shader_context *ctx,
- union si_shader_part_key *key);
-
/** Whether the shader runs as a combination of multiple API shaders */
static bool is_multi_part_shader(struct si_shader_context *ctx)
{
return unpack_llvm_param(ctx, value, rshift, bitwidth);
}
-static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
- LLVMValueRef i32, unsigned index)
-{
- assert(index <= 1);
-
- if (index == 1)
- return LLVMBuildAShr(ctx->ac.builder, i32,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
-
- return LLVMBuildSExt(ctx->ac.builder,
- LLVMBuildTrunc(ctx->ac.builder, i32,
- ctx->ac.i16, ""),
- ctx->ac.i32, "");
-}
-
-void si_llvm_load_input_vs(
- struct si_shader_context *ctx,
- unsigned input_index,
- LLVMValueRef out[4])
-{
- const struct si_shader_info *info = &ctx->shader->selector->info;
- unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
-
- if (vs_blit_property) {
- LLVMValueRef vertex_id = ctx->abi.vertex_id;
- LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
- LLVMIntULE, vertex_id,
- ctx->ac.i32_1, "");
- /* Use LLVMIntNE, because we have 3 vertices and only
- * the middle one should use y2.
- */
- LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
- LLVMIntNE, vertex_id,
- ctx->ac.i32_1, "");
-
- unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
- if (input_index == 0) {
- /* Position: */
- LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs);
- LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 1);
-
- LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
- LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
- LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
- LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
-
- LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
- x1, x2, "");
- LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
- y1, y2, "");
-
- out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
- out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
- out[2] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 2);
- out[3] = ctx->ac.f32_1;
- return;
- }
-
- /* Color or texture coordinates: */
- assert(input_index == 1);
-
- if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
- for (int i = 0; i < 4; i++) {
- out[i] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 3 + i);
- }
- } else {
- assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
- LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 3);
- LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 4);
- LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 5);
- LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 6);
-
- out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
- x1, x2, "");
- out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
- y1, y2, "");
- out[2] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 7);
- out[3] = LLVMGetParam(ctx->main_fn,
- param_vs_blit_inputs + 8);
- }
- return;
- }
-
- unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
- union si_vs_fix_fetch fix_fetch;
- LLVMValueRef vb_desc;
- LLVMValueRef vertex_index;
- LLVMValueRef tmp;
-
- if (input_index < num_vbos_in_user_sgprs) {
- vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
- } else {
- unsigned index= input_index - num_vbos_in_user_sgprs;
- vb_desc = ac_build_load_to_sgpr(&ctx->ac,
- ac_get_arg(&ctx->ac, ctx->vertex_buffers),
- LLVMConstInt(ctx->ac.i32, index, 0));
- }
-
- vertex_index = LLVMGetParam(ctx->main_fn,
- ctx->vertex_index0.arg_index +
- input_index);
-
- /* Use the open-coded implementation for all loads of doubles and
- * of dword-sized data that needs fixups. We need to insert conversion
- * code anyway, and the amd/common code does it for us.
- *
- * Note: On LLVM <= 8, we can only open-code formats with
- * channel size >= 4 bytes.
- */
- bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
- fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
- if (opencode ||
- (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
- (fix_fetch.u.log_size == 2)) {
- tmp = ac_build_opencoded_load_format(
- &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
- fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
- vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
- for (unsigned i = 0; i < 4; ++i)
- out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
- return;
- }
-
- /* Do multiple loads for special formats. */
- unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
- LLVMValueRef fetches[4];
- unsigned num_fetches;
- unsigned fetch_stride;
- unsigned channels_per_fetch;
-
- if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
- num_fetches = MIN2(required_channels, 3);
- fetch_stride = 1 << fix_fetch.u.log_size;
- channels_per_fetch = 1;
- } else {
- num_fetches = 1;
- fetch_stride = 0;
- channels_per_fetch = required_channels;
- }
-
- for (unsigned i = 0; i < num_fetches; ++i) {
- LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
- fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
- channels_per_fetch, 0, true);
- }
-
- if (num_fetches == 1 && channels_per_fetch > 1) {
- LLVMValueRef fetch = fetches[0];
- for (unsigned i = 0; i < channels_per_fetch; ++i) {
- tmp = LLVMConstInt(ctx->ac.i32, i, false);
- fetches[i] = LLVMBuildExtractElement(
- ctx->ac.builder, fetch, tmp, "");
- }
- num_fetches = channels_per_fetch;
- channels_per_fetch = 1;
- }
-
- for (unsigned i = num_fetches; i < 4; ++i)
- fetches[i] = LLVMGetUndef(ctx->ac.f32);
-
- if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
- required_channels == 4) {
- if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
- fetches[3] = ctx->ac.i32_1;
- else
- fetches[3] = ctx->ac.f32_1;
- } else if (fix_fetch.u.log_size == 3 &&
- (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
- fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
- fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
- required_channels == 4) {
- /* For 2_10_10_10, the hardware returns an unsigned value;
- * convert it to a signed one.
- */
- LLVMValueRef tmp = fetches[3];
- LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
-
- /* First, recover the sign-extended signed integer value. */
- if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
- tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
- else
- tmp = ac_to_integer(&ctx->ac, tmp);
-
- /* For the integer-like cases, do a natural sign extension.
- *
- * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
- * and happen to contain 0, 1, 2, 3 as the two LSBs of the
- * exponent.
- */
- tmp = LLVMBuildShl(ctx->ac.builder, tmp,
- fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
- LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
- tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
-
- /* Convert back to the right type. */
- if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
- LLVMValueRef clamp;
- LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
- tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
- clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
- tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
- } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
- tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
- }
-
- fetches[3] = tmp;
- }
-
- for (unsigned i = 0; i < 4; ++i)
- out[i] = ac_to_float(&ctx->ac, fetches[i]);
-}
-
LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
unsigned swizzle)
{
}
}
-static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
-{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
- /* For non-indexed draws, the base vertex set by the driver
- * (for direct draws) or the CP (for indirect draws) is the
- * first vertex ID, but GLSL expects 0 to be returned.
- */
- LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
- ctx->vs_state_bits);
- LLVMValueRef indexed;
-
- indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
- indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
-
- return LLVMBuildSelect(ctx->ac.builder, indexed,
- ac_get_arg(&ctx->ac, ctx->args.base_vertex),
- ctx->ac.i32_0, "");
-}
-
static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
}
-/* Initialize arguments for the shader export intrinsic */
-static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
- LLVMValueRef *values,
- unsigned target,
- struct ac_export_args *args)
-{
- args->enabled_channels = 0xf; /* writemask - default is 0xf */
- args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
- args->done = 0; /* Specify whether this is the last export */
- args->target = target; /* Specify the target we are exporting */
- args->compr = false;
-
- memcpy(&args->out[0], values, sizeof(values[0]) * 4);
-}
-
-static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
- struct ac_export_args *pos, LLVMValueRef *out_elts)
-{
- unsigned reg_index;
- unsigned chan;
- unsigned const_chan;
- LLVMValueRef base_elt;
- LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
- SI_VS_CONST_CLIP_PLANES, 0);
- LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
-
- for (reg_index = 0; reg_index < 2; reg_index ++) {
- struct ac_export_args *args = &pos[2 + reg_index];
-
- args->out[0] =
- args->out[1] =
- args->out[2] =
- args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
-
- /* Compute dot products of position and user clip plane vectors */
- for (chan = 0; chan < 4; chan++) {
- for (const_chan = 0; const_chan < 4; const_chan++) {
- LLVMValueRef addr =
- LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
- const_chan) * 4, 0);
- base_elt = si_buffer_load_const(ctx, const_resource,
- addr);
- args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
- out_elts[const_chan], args->out[chan]);
- }
- }
-
- args->enabled_channels = 0xf;
- args->valid_mask = 0;
- args->done = 0;
- args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
- args->compr = 0;
- }
-}
-
static void si_dump_streamout(struct pipe_stream_output_info *so)
{
unsigned i;
}
}
-void si_emit_streamout_output(struct si_shader_context *ctx,
- LLVMValueRef const *so_buffers,
- LLVMValueRef const *so_write_offsets,
- struct pipe_stream_output *stream_out,
- struct si_shader_output_values *shader_out)
-{
- unsigned buf_idx = stream_out->output_buffer;
- unsigned start = stream_out->start_component;
- unsigned num_comps = stream_out->num_components;
- LLVMValueRef out[4];
-
- assert(num_comps && num_comps <= 4);
- if (!num_comps || num_comps > 4)
- return;
-
- /* Load the output as int. */
- for (int j = 0; j < num_comps; j++) {
- assert(stream_out->stream == shader_out->vertex_stream[start + j]);
-
- out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
- }
-
- /* Pack the output. */
- LLVMValueRef vdata = NULL;
-
- switch (num_comps) {
- case 1: /* as i32 */
- vdata = out[0];
- break;
- case 2: /* as v2i32 */
- case 3: /* as v3i32 */
- if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
- vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
- break;
- }
- /* as v4i32 (aligned to 4) */
- out[3] = LLVMGetUndef(ctx->ac.i32);
- /* fall through */
- case 4: /* as v4i32 */
- vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
- break;
- }
-
- ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
- vdata, num_comps,
- so_write_offsets[buf_idx],
- ctx->ac.i32_0,
- stream_out->dst_offset * 4, ac_glc | ac_slc);
-}
-
-/**
- * Write streamout data to buffers for vertex stream @p stream (different
- * vertex streams can occur for GS copy shaders).
- */
-void si_llvm_emit_streamout(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput, unsigned stream)
-{
- struct si_shader_selector *sel = ctx->shader->selector;
- struct pipe_stream_output_info *so = &sel->so;
- LLVMBuilderRef builder = ctx->ac.builder;
- int i;
-
- /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
- LLVMValueRef so_vtx_count =
- si_unpack_param(ctx, ctx->streamout_config, 16, 7);
-
- LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
-
- /* can_emit = tid < so_vtx_count; */
- LLVMValueRef can_emit =
- LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
-
- /* Emit the streamout code conditionally. This actually avoids
- * out-of-bounds buffer access. The hw tells us via the SGPR
- * (so_vtx_count) which threads are allowed to emit streamout data. */
- ac_build_ifcc(&ctx->ac, can_emit, 6501);
- {
- /* The buffer offset is computed as follows:
- * ByteOffset = streamout_offset[buffer_id]*4 +
- * (streamout_write_index + thread_id)*stride[buffer_id] +
- * attrib_offset
- */
-
- LLVMValueRef so_write_index =
- ac_get_arg(&ctx->ac,
- ctx->streamout_write_index);
-
- /* Compute (streamout_write_index + thread_id). */
- so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
-
- /* Load the descriptor and compute the write offset for each
- * enabled buffer. */
- LLVMValueRef so_write_offset[4] = {};
- LLVMValueRef so_buffers[4];
- LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
- ctx->rw_buffers);
-
- for (i = 0; i < 4; i++) {
- if (!so->stride[i])
- continue;
-
- LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
- SI_VS_STREAMOUT_BUF0 + i, 0);
-
- so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
-
- LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
- ctx->streamout_offset[i]);
- so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
-
- so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
- LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0),
- so_offset);
- }
-
- /* Write streamout data. */
- for (i = 0; i < so->num_outputs; i++) {
- unsigned reg = so->output[i].register_index;
-
- if (reg >= noutput)
- continue;
-
- if (stream != so->output[i].stream)
- continue;
-
- si_emit_streamout_output(ctx, so_buffers, so_write_offset,
- &so->output[i], &outputs[reg]);
- }
- }
- ac_build_endif(&ctx->ac, 6501);
-}
-
-static void si_export_param(struct si_shader_context *ctx, unsigned index,
- LLVMValueRef *values)
-{
- struct ac_export_args args;
-
- si_llvm_init_vs_export_args(ctx, values,
- V_008DFC_SQ_EXP_PARAM + index, &args);
- ac_build_export(&ctx->ac, &args);
-}
-
-static void si_build_param_exports(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput)
-{
- struct si_shader *shader = ctx->shader;
- unsigned param_count = 0;
-
- for (unsigned i = 0; i < noutput; i++) {
- unsigned semantic_name = outputs[i].semantic_name;
- unsigned semantic_index = outputs[i].semantic_index;
-
- if (outputs[i].vertex_stream[0] != 0 &&
- outputs[i].vertex_stream[1] != 0 &&
- outputs[i].vertex_stream[2] != 0 &&
- outputs[i].vertex_stream[3] != 0)
- continue;
-
- switch (semantic_name) {
- case TGSI_SEMANTIC_LAYER:
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- case TGSI_SEMANTIC_CLIPDIST:
- case TGSI_SEMANTIC_COLOR:
- case TGSI_SEMANTIC_BCOLOR:
- case TGSI_SEMANTIC_PRIMID:
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_TEXCOORD:
- case TGSI_SEMANTIC_GENERIC:
- break;
- default:
- continue;
- }
-
- if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
- semantic_index < SI_MAX_IO_GENERIC) &&
- shader->key.opt.kill_outputs &
- (1ull << si_shader_io_get_unique_index(semantic_name,
- semantic_index, true)))
- continue;
-
- si_export_param(ctx, param_count, outputs[i].values);
-
- assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
- shader->info.vs_output_param_offset[i] = param_count++;
- }
-
- shader->info.nr_param_exports = param_count;
-}
-
-/**
- * Vertex color clamping.
- *
- * This uses a state constant loaded in a user data SGPR and
- * an IF statement is added that clamps all colors if the constant
- * is true.
- */
-static void si_vertex_color_clamping(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput)
-{
- LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
- bool has_colors = false;
-
- /* Store original colors to alloca variables. */
- for (unsigned i = 0; i < noutput; i++) {
- if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
- outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
- LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
- }
- has_colors = true;
- }
-
- if (!has_colors)
- return;
-
- /* The state is in the first bit of the user SGPR. */
- LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
- cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
-
- ac_build_ifcc(&ctx->ac, cond, 6502);
-
- /* Store clamped colors to alloca variables within the conditional block. */
- for (unsigned i = 0; i < noutput; i++) {
- if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
- outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- LLVMBuildStore(ctx->ac.builder,
- ac_build_clamp(&ctx->ac, outputs[i].values[j]),
- addr[i][j]);
- }
- }
- ac_build_endif(&ctx->ac, 6502);
-
- /* Load clamped colors */
- for (unsigned i = 0; i < noutput; i++) {
- if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
- outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- outputs[i].values[j] =
- LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
- }
- }
-}
-
-/* Generate export instructions for hardware VS shader stage or NGG GS stage
- * (position and parameter data only).
- */
-void si_llvm_export_vs(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput)
-{
- struct si_shader *shader = ctx->shader;
- struct ac_export_args pos_args[4] = {};
- LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
- unsigned pos_idx;
- int i;
-
- si_vertex_color_clamping(ctx, outputs, noutput);
-
- /* Build position exports. */
- for (i = 0; i < noutput; i++) {
- switch (outputs[i].semantic_name) {
- case TGSI_SEMANTIC_POSITION:
- si_llvm_init_vs_export_args(ctx, outputs[i].values,
- V_008DFC_SQ_EXP_POS, &pos_args[0]);
- break;
- case TGSI_SEMANTIC_PSIZE:
- psize_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_LAYER:
- layer_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_VIEWPORT_INDEX:
- viewport_index_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_EDGEFLAG:
- edgeflag_value = outputs[i].values[0];
- break;
- case TGSI_SEMANTIC_CLIPDIST:
- if (!shader->key.opt.clip_disable) {
- unsigned index = 2 + outputs[i].semantic_index;
- si_llvm_init_vs_export_args(ctx, outputs[i].values,
- V_008DFC_SQ_EXP_POS + index,
- &pos_args[index]);
- }
- break;
- case TGSI_SEMANTIC_CLIPVERTEX:
- if (!shader->key.opt.clip_disable) {
- si_llvm_emit_clipvertex(ctx, pos_args,
- outputs[i].values);
- }
- break;
- }
- }
-
- /* We need to add the position output manually if it's missing. */
- if (!pos_args[0].out[0]) {
- pos_args[0].enabled_channels = 0xf; /* writemask */
- pos_args[0].valid_mask = 0; /* EXEC mask */
- pos_args[0].done = 0; /* last export? */
- pos_args[0].target = V_008DFC_SQ_EXP_POS;
- pos_args[0].compr = 0; /* COMPR flag */
- pos_args[0].out[0] = ctx->ac.f32_0; /* X */
- pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
- pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
- pos_args[0].out[3] = ctx->ac.f32_1; /* W */
- }
-
- bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
- !shader->key.as_ngg;
-
- /* Write the misc vector (point size, edgeflag, layer, viewport). */
- if (shader->selector->info.writes_psize ||
- pos_writes_edgeflag ||
- shader->selector->info.writes_viewport_index ||
- shader->selector->info.writes_layer) {
- pos_args[1].enabled_channels = shader->selector->info.writes_psize |
- (pos_writes_edgeflag << 1) |
- (shader->selector->info.writes_layer << 2);
-
- pos_args[1].valid_mask = 0; /* EXEC mask */
- pos_args[1].done = 0; /* last export? */
- pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
- pos_args[1].compr = 0; /* COMPR flag */
- pos_args[1].out[0] = ctx->ac.f32_0; /* X */
- pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
- pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
- pos_args[1].out[3] = ctx->ac.f32_0; /* W */
-
- if (shader->selector->info.writes_psize)
- pos_args[1].out[0] = psize_value;
-
- if (pos_writes_edgeflag) {
- /* The output is a float, but the hw expects an integer
- * with the first bit containing the edge flag. */
- edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
- edgeflag_value,
- ctx->ac.i32, "");
- edgeflag_value = ac_build_umin(&ctx->ac,
- edgeflag_value,
- ctx->ac.i32_1);
-
- /* The LLVM intrinsic expects a float. */
- pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
- }
-
- if (ctx->screen->info.chip_class >= GFX9) {
- /* GFX9 has the layer in out.z[10:0] and the viewport
- * index in out.z[19:16].
- */
- if (shader->selector->info.writes_layer)
- pos_args[1].out[2] = layer_value;
-
- if (shader->selector->info.writes_viewport_index) {
- LLVMValueRef v = viewport_index_value;
-
- v = ac_to_integer(&ctx->ac, v);
- v = LLVMBuildShl(ctx->ac.builder, v,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
- v = LLVMBuildOr(ctx->ac.builder, v,
- ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
- pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
- pos_args[1].enabled_channels |= 1 << 2;
- }
- } else {
- if (shader->selector->info.writes_layer)
- pos_args[1].out[2] = layer_value;
-
- if (shader->selector->info.writes_viewport_index) {
- pos_args[1].out[3] = viewport_index_value;
- pos_args[1].enabled_channels |= 1 << 3;
- }
- }
- }
-
- for (i = 0; i < 4; i++)
- if (pos_args[i].out[0])
- shader->info.nr_pos_exports++;
-
- /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
- * Setting valid_mask=1 prevents it and has no other effect.
- */
- if (ctx->screen->info.family == CHIP_NAVI10 ||
- ctx->screen->info.family == CHIP_NAVI12 ||
- ctx->screen->info.family == CHIP_NAVI14)
- pos_args[0].valid_mask = 1;
-
- pos_idx = 0;
- for (i = 0; i < 4; i++) {
- if (!pos_args[i].out[0])
- continue;
-
- /* Specify the target we are exporting */
- pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
-
- if (pos_idx == shader->info.nr_pos_exports)
- /* Specify that this is the last export */
- pos_args[i].done = 1;
-
- ac_build_export(&ctx->ac, &pos_args[i]);
- }
-
- /* Build parameter exports. */
- si_build_param_exports(ctx, outputs, noutput);
-}
-
-static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
-{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- struct si_shader_output_values *outputs = NULL;
- int i,j;
-
- assert(!ctx->shader->is_gs_copy_shader);
- assert(info->num_outputs <= max_outputs);
-
- outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
-
- for (i = 0; i < info->num_outputs; i++) {
- outputs[i].semantic_name = info->output_semantic_name[i];
- outputs[i].semantic_index = info->output_semantic_index[i];
-
- for (j = 0; j < 4; j++) {
- outputs[i].values[j] =
- LLVMBuildLoad(ctx->ac.builder,
- addrs[4 * i + j],
- "");
- outputs[i].vertex_stream[j] =
- (info->output_streams[i] >> (2 * j)) & 3;
- }
- }
-
- if (!ctx->screen->use_ngg_streamout &&
- ctx->shader->selector->so.num_outputs)
- si_llvm_emit_streamout(ctx, outputs, i, 0);
-
- /* Export PrimitiveID. */
- if (ctx->shader->key.mono.u.vs_export_prim_id) {
- outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
- outputs[i].semantic_index = 0;
- outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
- for (j = 1; j < 4; j++)
- outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
-
- memset(outputs[i].vertex_stream, 0,
- sizeof(outputs[i].vertex_stream));
- i++;
- }
-
- si_llvm_export_vs(ctx, outputs, i);
- FREE(outputs);
-}
-
-static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
- unsigned max_outputs,
- LLVMValueRef *addrs)
-{
- struct si_shader_context *ctx = si_shader_context_from_abi(abi);
- struct si_shader_info *info = &ctx->shader->selector->info;
- LLVMValueRef pos[4] = {};
-
- assert(info->num_outputs <= max_outputs);
-
- for (unsigned i = 0; i < info->num_outputs; i++) {
- if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
- continue;
-
- for (unsigned chan = 0; chan < 4; chan++)
- pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
- break;
- }
- assert(pos[0] != NULL);
-
- /* Return the position output. */
- LLVMValueRef ret = ctx->return_value;
- for (unsigned chan = 0; chan < 4; chan++)
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
- ctx->return_value = ret;
-}
-
static void declare_streamout_params(struct si_shader_context *ctx,
struct pipe_stream_output_info *so)
{
&shader->info.nr_param_exports);
}
-static void si_init_exec_from_input(struct si_shader_context *ctx,
- struct ac_arg param, unsigned bitoffset)
-{
- LLVMValueRef args[] = {
- ac_get_arg(&ctx->ac, param),
- LLVMConstInt(ctx->ac.i32, bitoffset, 0),
- };
- ac_build_intrinsic(&ctx->ac,
- "llvm.amdgcn.init.exec.from.input",
- ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
-}
-
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
const struct si_vs_prolog_bits *prolog_key,
const struct si_shader_key *key,
switch (ctx->type) {
case PIPE_SHADER_VERTEX:
- if (shader->key.as_ls)
- ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
- else if (shader->key.as_es)
- ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
- else if (shader->key.opt.vs_as_prim_discard_cs)
- ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
- else if (ngg_cull_shader)
- ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
- else if (shader->key.as_ngg)
- ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
- else
- ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
- ctx->abi.load_base_vertex = get_base_vertex;
+ si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
break;
case PIPE_SHADER_TESS_CTRL:
si_llvm_init_tcs_callbacks(ctx);
break;
case PIPE_SHADER_TESS_EVAL:
- si_llvm_init_tes_callbacks(ctx);
-
- if (shader->key.as_es)
- ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
- else if (ngg_cull_shader)
- ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
- else if (shader->key.as_ngg)
- ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
- else
- ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+ si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
break;
case PIPE_SHADER_GEOMETRY:
si_llvm_init_gs_callbacks(ctx);
&shader->key.part.vs.prolog,
shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
- si_build_vs_prolog_function(&ctx, &prolog_key);
+ si_llvm_build_vs_prolog(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
has_prolog = true;
}
&shader->key.part.vs.prolog,
shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
- si_build_vs_prolog_function(&ctx, &prolog_key);
+ si_llvm_build_vs_prolog(&ctx, &prolog_key);
parts[num_parts++] = ctx.main_fn;
has_prolog = true;
}
&shader->key.part.tcs.ls_prolog,
shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
- si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+ si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
parts[0] = ctx.main_fn;
}
&shader->key.part.gs.vs_prolog,
shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
- si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+ si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
es_prolog = ctx.main_fn;
}
return result;
}
-/**
- * Build the vertex shader prolog function.
- *
- * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
- * All inputs are returned unmodified. The vertex load indices are
- * stored after them, which will be used by the API VS for fetching inputs.
- *
- * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
- * input_v0,
- * input_v1,
- * input_v2,
- * input_v3,
- * (VertexID + BaseVertex),
- * (InstanceID + StartInstance),
- * (InstanceID / 2 + StartInstance)
- */
-static void si_build_vs_prolog_function(struct si_shader_context *ctx,
- union si_shader_part_key *key)
-{
- LLVMTypeRef *returns;
- LLVMValueRef ret, func;
- int num_returns, i;
- unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
- unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 +
- (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
- struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
- struct ac_arg input_vgpr_param[13];
- LLVMValueRef input_vgprs[13];
- unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
- num_input_vgprs;
- unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
-
- memset(&ctx->args, 0, sizeof(ctx->args));
-
- /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
- returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
- sizeof(LLVMTypeRef));
- num_returns = 0;
-
- /* Declare input and output SGPRs. */
- for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
- &input_sgpr_param[i]);
- returns[num_returns++] = ctx->ac.i32;
- }
-
- struct ac_arg merged_wave_info = input_sgpr_param[3];
-
- /* Preloaded VGPRs (outputs must be floats) */
- for (i = 0; i < num_input_vgprs; i++) {
- ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
- returns[num_returns++] = ctx->ac.f32;
- }
-
- /* Vertex load indices. */
- for (i = 0; i < key->vs_prolog.num_inputs; i++)
- returns[num_returns++] = ctx->ac.f32;
-
- /* Create the function. */
- si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
- func = ctx->main_fn;
-
- for (i = 0; i < num_input_vgprs; i++) {
- input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
- }
-
- if (key->vs_prolog.num_merged_next_stage_vgprs) {
- if (!key->vs_prolog.is_monolithic)
- si_init_exec_from_input(ctx, merged_wave_info, 0);
-
- if (key->vs_prolog.as_ls &&
- ctx->screen->info.has_ls_vgpr_init_bug) {
- /* If there are no HS threads, SPI loads the LS VGPRs
- * starting at VGPR 0. Shift them back to where they
- * belong.
- */
- LLVMValueRef has_hs_threads =
- LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
- si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
- ctx->ac.i32_0, "");
-
- for (i = 4; i > 0; --i) {
- input_vgprs[i + 1] =
- LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
- input_vgprs[i + 1],
- input_vgprs[i - 1], "");
- }
- }
- }
-
- if (key->vs_prolog.gs_fast_launch_tri_list ||
- key->vs_prolog.gs_fast_launch_tri_strip) {
- LLVMValueRef wave_id, thread_id_in_tg;
-
- wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
- thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
- LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
- ac_get_thread_id(&ctx->ac));
-
- /* The GS fast launch initializes all VGPRs to the value of
- * the first thread, so we have to add the thread ID.
- *
- * Only these are initialized by the hw:
- * VGPR2: Base Primitive ID
- * VGPR5: Base Vertex ID
- * VGPR6: Instance ID
- */
-
- /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
- * The NGG cull shader will read them from there.
- */
- if (key->vs_prolog.gs_fast_launch_tri_list) {
- input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
- LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
- LLVMConstInt(ctx->ac.i32, 0, 0));
- input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
- LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
- LLVMConstInt(ctx->ac.i32, 1, 0));
- input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
- LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
- LLVMConstInt(ctx->ac.i32, 2, 0));
- } else {
- assert(key->vs_prolog.gs_fast_launch_tri_strip);
- LLVMBuilderRef builder = ctx->ac.builder;
- /* Triangle indices: */
- LLVMValueRef index[3] = {
- thread_id_in_tg,
- LLVMBuildAdd(builder, thread_id_in_tg,
- LLVMConstInt(ctx->ac.i32, 1, 0), ""),
- LLVMBuildAdd(builder, thread_id_in_tg,
- LLVMConstInt(ctx->ac.i32, 2, 0), ""),
- };
- LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
- thread_id_in_tg, ctx->ac.i1, "");
- LLVMValueRef flatshade_first =
- LLVMBuildICmp(builder, LLVMIntEQ,
- si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
- ctx->ac.i32_0, "");
-
- ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
- flatshade_first, index);
- input_vgprs[0] = index[0];
- input_vgprs[1] = index[1];
- input_vgprs[4] = index[2];
- }
-
- /* Triangles always have all edge flags set initially. */
- input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
-
- input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
- thread_id_in_tg, ""); /* PrimID */
- input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
- thread_id_in_tg, ""); /* VertexID */
- input_vgprs[8] = input_vgprs[6]; /* InstanceID */
- }
-
- unsigned vertex_id_vgpr = first_vs_vgpr;
- unsigned instance_id_vgpr =
- ctx->screen->info.chip_class >= GFX10 ?
- first_vs_vgpr + 3 :
- first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
-
- ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
- ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
-
- /* InstanceID = VertexID >> 16;
- * VertexID = VertexID & 0xffff;
- */
- if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
- ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
- LLVMConstInt(ctx->ac.i32, 16, 0), "");
- ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
- LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
- }
-
- /* Copy inputs to outputs. This should be no-op, as the registers match,
- * but it will prevent the compiler from overwriting them unintentionally.
- */
- ret = ctx->return_value;
- for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
- LLVMValueRef p = LLVMGetParam(func, i);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
- }
- for (i = 0; i < num_input_vgprs; i++) {
- LLVMValueRef p = input_vgprs[i];
-
- if (i == vertex_id_vgpr)
- p = ctx->abi.vertex_id;
- else if (i == instance_id_vgpr)
- p = ctx->abi.instance_id;
-
- p = ac_to_float(&ctx->ac, p);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
- key->vs_prolog.num_input_sgprs + i, "");
- }
-
- /* Compute vertex load indices from instance divisors. */
- LLVMValueRef instance_divisor_constbuf = NULL;
-
- if (key->vs_prolog.states.instance_divisor_is_fetched) {
- LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
- LLVMValueRef buf_index =
- LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
- instance_divisor_constbuf =
- ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
- }
-
- for (i = 0; i < key->vs_prolog.num_inputs; i++) {
- bool divisor_is_one =
- key->vs_prolog.states.instance_divisor_is_one & (1u << i);
- bool divisor_is_fetched =
- key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
- LLVMValueRef index = NULL;
-
- if (divisor_is_one) {
- index = ctx->abi.instance_id;
- } else if (divisor_is_fetched) {
- LLVMValueRef udiv_factors[4];
-
- for (unsigned j = 0; j < 4; j++) {
- udiv_factors[j] =
- si_buffer_load_const(ctx, instance_divisor_constbuf,
- LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0));
- udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
- }
- /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
- * Such InstanceID might not be achievable in a reasonable time though.
- */
- index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
- udiv_factors[0], udiv_factors[1],
- udiv_factors[2], udiv_factors[3]);
- }
-
- if (divisor_is_one || divisor_is_fetched) {
- /* Add StartInstance. */
- index = LLVMBuildAdd(ctx->ac.builder, index,
- LLVMGetParam(ctx->main_fn, user_sgpr_base +
- SI_SGPR_START_INSTANCE), "");
- } else {
- /* VertexID + BaseVertex */
- index = LLVMBuildAdd(ctx->ac.builder,
- ctx->abi.vertex_id,
- LLVMGetParam(func, user_sgpr_base +
- SI_SGPR_BASE_VERTEX), "");
- }
-
- index = ac_to_float(&ctx->ac, index);
- ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
- ctx->args.arg_count + i, "");
- }
-
- si_llvm_build_ret(ctx, ret);
-}
-
static bool si_get_vs_prolog(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader *shader,
shader->prolog =
si_get_shader_part(sscreen, &sscreen->vs_prologs,
PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
- debug, si_build_vs_prolog_function,
+ debug, si_llvm_build_vs_prolog,
"Vertex Shader Prolog");
return shader->prolog != NULL;
}
LLVMValueRef val2);
void si_llvm_emit_barrier(struct si_shader_context *ctx);
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
+void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
+ unsigned bitoffset);
void si_declare_compute_memory(struct si_shader_context *ctx);
LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
unsigned swizzle);
-void si_llvm_export_vs(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput);
-void si_emit_streamout_output(struct si_shader_context *ctx,
- LLVMValueRef const *so_buffers,
- LLVMValueRef const *so_write_offsets,
- struct pipe_stream_output *stream_out,
- struct si_shader_output_values *shader_out);
void si_add_arg_checked(struct ac_shader_args *args,
enum ac_arg_regfile file,
unsigned registers, enum ac_arg_type type,
struct ac_arg *arg,
unsigned idx);
-
-void si_llvm_load_input_vs(
- struct si_shader_context *ctx,
- unsigned input_index,
- LLVMValueRef out[4]);
-
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
const char *name,
bool less_optimized);
void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
-void si_llvm_emit_streamout(struct si_shader_context *ctx,
- struct si_shader_output_values *outputs,
- unsigned noutput, unsigned stream);
void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader);
bool gfx10_ngg_export_prim_early(struct si_shader *shader);
void si_llvm_build_tcs_epilog(struct si_shader_context *ctx,
union si_shader_part_key *key);
void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx);
-void si_llvm_init_tes_callbacks(struct si_shader_context *ctx);
+void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
/* si_shader_llvm_ps.c */
void si_llvm_build_ps_prolog(struct si_shader_context *ctx,
/* si_shader_llvm_resources.c */
void si_llvm_init_resource_callbacks(struct si_shader_context *ctx);
+/* si_shader_llvm_vs.c */
+void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir);
+void si_llvm_streamout_store_output(struct si_shader_context *ctx,
+ LLVMValueRef const *so_buffers,
+ LLVMValueRef const *so_write_offsets,
+ struct pipe_stream_output *stream_out,
+ struct si_shader_output_values *shader_out);
+void si_llvm_emit_streamout(struct si_shader_context *ctx,
+ struct si_shader_output_values *outputs,
+ unsigned noutput, unsigned stream);
+void si_llvm_build_vs_exports(struct si_shader_context *ctx,
+ struct si_shader_output_values *outputs,
+ unsigned noutput);
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs);
+void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
+ union si_shader_part_key *key);
+void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);
+
#endif
LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
}
+
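+/* Initialize the EXEC mask from a thread-count bitfield stored in the given
+ * SGPR argument at the given bit offset, using the
+ * llvm.amdgcn.init.exec.from.input intrinsic. Non-monolithic merged shader
+ * parts use this so that only lanes carrying live threads keep executing.
+ */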
+void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
+ unsigned bitoffset)
+{
+ LLVMValueRef args[] = {
+ ac_get_arg(&ctx->ac, param),
+ LLVMConstInt(ctx->ac.i32, bitoffset, 0),
+ };
+ ac_build_intrinsic(&ctx->ac,
+ "llvm.amdgcn.init.exec.from.input",
+ ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
+}
}
if (stream == 0)
- si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
+ si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
LLVMBuildBr(builder, end_bb);
}
ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
}
-void si_llvm_init_tes_callbacks(struct si_shader_context *ctx)
+void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
ctx->abi.load_tess_varyings = si_nir_load_input_tes;
ctx->abi.load_tess_coord = si_load_tess_coord;
ctx->abi.load_tess_level = si_load_tess_level;
ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
+
+ if (ctx->shader->key.as_es)
+ ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+ else if (ctx->shader->key.as_ngg)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+ else
+ ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
}
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "si_shader_internal.h"
+#include "si_pipe.h"
+#include "sid.h"
+#include "util/u_memory.h"
+
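+/* Sign-extend one 16-bit half of a 32-bit integer: index 0 selects the low
+ * half, index 1 the high half. E.g. for i32 = 0xfffe0003, index 0 yields 3
+ * and index 1 yields -2.
+ */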
+static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
+ LLVMValueRef i32, unsigned index)
+{
+ assert(index <= 1);
+
+ if (index == 1)
+ return LLVMBuildAShr(ctx->ac.builder, i32,
+ LLVMConstInt(ctx->ac.i32, 16, 0), "");
+
+ return LLVMBuildSExt(ctx->ac.builder,
+ LLVMBuildTrunc(ctx->ac.builder, i32,
+ ctx->ac.i16, ""),
+ ctx->ac.i32, "");
+}
+
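+/* Load one VS input. For blit shaders (TGSI_PROPERTY_VS_BLIT_SGPRS_AMD),
+ * position and color/texcoord inputs are unpacked from user SGPRs; otherwise
+ * the attribute is fetched from its vertex buffer descriptor, applying any
+ * format fixups required by the shader key.
+ */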
+static void load_input_vs(struct si_shader_context *ctx, unsigned input_index,
+ LLVMValueRef out[4])
+{
+ const struct si_shader_info *info = &ctx->shader->selector->info;
+ unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
+
+ if (vs_blit_property) {
+ LLVMValueRef vertex_id = ctx->abi.vertex_id;
+ LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
+ LLVMIntULE, vertex_id,
+ ctx->ac.i32_1, "");
+ /* Use LLVMIntNE, because we have 3 vertices and only
+ * the middle one should use y2.
+ */
+ LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
+ LLVMIntNE, vertex_id,
+ ctx->ac.i32_1, "");
+
+ unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
+ if (input_index == 0) {
+ /* Position: */
+ LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs);
+ LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 1);
+
+ LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
+ LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
+ LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
+ LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
+
+ LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
+ x1, x2, "");
+ LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
+ y1, y2, "");
+
+ out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
+ out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
+ out[2] = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 2);
+ out[3] = ctx->ac.f32_1;
+ return;
+ }
+
+ /* Color or texture coordinates: */
+ assert(input_index == 1);
+
+ if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
+ for (int i = 0; i < 4; i++) {
+ out[i] = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 3 + i);
+ }
+ } else {
+ assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
+ LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 3);
+ LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 4);
+ LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 5);
+ LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 6);
+
+ out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
+ x1, x2, "");
+ out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
+ y1, y2, "");
+ out[2] = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 7);
+ out[3] = LLVMGetParam(ctx->main_fn,
+ param_vs_blit_inputs + 8);
+ }
+ return;
+ }
+
+ unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
+ union si_vs_fix_fetch fix_fetch;
+ LLVMValueRef vb_desc;
+ LLVMValueRef vertex_index;
+ LLVMValueRef tmp;
+
+ if (input_index < num_vbos_in_user_sgprs) {
+ vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
+ } else {
+		unsigned index = input_index - num_vbos_in_user_sgprs;
+ vb_desc = ac_build_load_to_sgpr(&ctx->ac,
+ ac_get_arg(&ctx->ac, ctx->vertex_buffers),
+ LLVMConstInt(ctx->ac.i32, index, 0));
+ }
+
+ vertex_index = LLVMGetParam(ctx->main_fn,
+ ctx->vertex_index0.arg_index +
+ input_index);
+
+ /* Use the open-coded implementation for all loads of doubles and
+ * of dword-sized data that needs fixups. We need to insert conversion
+ * code anyway, and the amd/common code does it for us.
+ *
+ * Note: On LLVM <= 8, we can only open-code formats with
+ * channel size >= 4 bytes.
+ */
+ bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
+ fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
+ if (opencode ||
+ (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
+ (fix_fetch.u.log_size == 2)) {
+ tmp = ac_build_opencoded_load_format(
+ &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
+ fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
+ vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
+ for (unsigned i = 0; i < 4; ++i)
+ out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
+ return;
+ }
+
+ /* Do multiple loads for special formats. */
+ unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
+ LLVMValueRef fetches[4];
+ unsigned num_fetches;
+ unsigned fetch_stride;
+ unsigned channels_per_fetch;
+
+ if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
+ num_fetches = MIN2(required_channels, 3);
+ fetch_stride = 1 << fix_fetch.u.log_size;
+ channels_per_fetch = 1;
+ } else {
+ num_fetches = 1;
+ fetch_stride = 0;
+ channels_per_fetch = required_channels;
+ }
+
+ for (unsigned i = 0; i < num_fetches; ++i) {
+ LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
+ fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
+ channels_per_fetch, 0, true);
+ }
+
+ if (num_fetches == 1 && channels_per_fetch > 1) {
+ LLVMValueRef fetch = fetches[0];
+ for (unsigned i = 0; i < channels_per_fetch; ++i) {
+ tmp = LLVMConstInt(ctx->ac.i32, i, false);
+ fetches[i] = LLVMBuildExtractElement(
+ ctx->ac.builder, fetch, tmp, "");
+ }
+ num_fetches = channels_per_fetch;
+ channels_per_fetch = 1;
+ }
+
+ for (unsigned i = num_fetches; i < 4; ++i)
+ fetches[i] = LLVMGetUndef(ctx->ac.f32);
+
+ if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
+ required_channels == 4) {
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
+ fetches[3] = ctx->ac.i32_1;
+ else
+ fetches[3] = ctx->ac.f32_1;
+ } else if (fix_fetch.u.log_size == 3 &&
+ (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
+ fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
+ fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
+ required_channels == 4) {
+ /* For 2_10_10_10, the hardware returns an unsigned value;
+ * convert it to a signed one.
+ */
+ LLVMValueRef tmp = fetches[3];
+ LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
+
+ /* First, recover the sign-extended signed integer value. */
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
+ tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
+ else
+ tmp = ac_to_integer(&ctx->ac, tmp);
+
+ /* For the integer-like cases, do a natural sign extension.
+ *
+ * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+ * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+ * exponent.
+ */
+ tmp = LLVMBuildShl(ctx->ac.builder, tmp,
+ fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
+ LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
+ tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
+
+ /* Convert back to the right type. */
+ if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
+ LLVMValueRef clamp;
+ LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
+ tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
+ clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
+ tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
+ } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
+ tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
+ }
+
+ fetches[3] = tmp;
+ }
+
+ for (unsigned i = 0; i < 4; ++i)
+ out[i] = ac_to_float(&ctx->ac, fetches[i]);
+}
+
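+/* Fetch the attribute at the given component index (attribute slot * 4) and
+ * store its four channels as i32 values in ctx->inputs.
+ */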
+static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index)
+{
+ LLVMValueRef input[4];
+
+ load_input_vs(ctx, input_index / 4, input);
+
+ for (unsigned chan = 0; chan < 4; chan++) {
+ ctx->inputs[input_index + chan] =
+ LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
+ }
+}
+
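+/* Load all VS inputs declared by the NIR shader. Each attribute slot
+ * occupies four components in ctx->inputs; dual-slot (64-bit) attributes
+ * take two slots, and packed components sharing a location are loaded
+ * only once.
+ */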
+void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
+{
+ uint64_t processed_inputs = 0;
+
+ nir_foreach_variable(variable, &nir->inputs) {
+ unsigned attrib_count = glsl_count_attribute_slots(variable->type,
+ true);
+ unsigned input_idx = variable->data.driver_location;
+ unsigned loc = variable->data.location;
+
+ for (unsigned i = 0; i < attrib_count; i++) {
+ /* Packed components share the same location so skip
+ * them if we have already processed the location.
+ */
+ if (processed_inputs & ((uint64_t)1 << (loc + i))) {
+ input_idx += 4;
+ continue;
+ }
+
+ declare_input_vs(ctx, input_idx);
+ if (glsl_type_is_dual_slot(variable->type)) {
+ input_idx += 4;
+ declare_input_vs(ctx, input_idx);
+ }
+
+ processed_inputs |= ((uint64_t)1 << (loc + i));
+ input_idx += 4;
+ }
+ }
+}
+
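+/* Store one shader output to its streamout buffer: load the selected
+ * components as integers, pack them into a vector, and write it at the
+ * caller-computed per-thread write offset for the target buffer.
+ */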
+void si_llvm_streamout_store_output(struct si_shader_context *ctx,
+ LLVMValueRef const *so_buffers,
+ LLVMValueRef const *so_write_offsets,
+ struct pipe_stream_output *stream_out,
+ struct si_shader_output_values *shader_out)
+{
+ unsigned buf_idx = stream_out->output_buffer;
+ unsigned start = stream_out->start_component;
+ unsigned num_comps = stream_out->num_components;
+ LLVMValueRef out[4];
+
+ assert(num_comps && num_comps <= 4);
+ if (!num_comps || num_comps > 4)
+ return;
+
+ /* Load the output as int. */
+ for (int j = 0; j < num_comps; j++) {
+ assert(stream_out->stream == shader_out->vertex_stream[start + j]);
+
+ out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
+ }
+
+ /* Pack the output. */
+ LLVMValueRef vdata = NULL;
+
+ switch (num_comps) {
+ case 1: /* as i32 */
+ vdata = out[0];
+ break;
+ case 2: /* as v2i32 */
+ case 3: /* as v3i32 */
+ if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
+ vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
+ break;
+ }
+ /* as v4i32 (aligned to 4) */
+ out[3] = LLVMGetUndef(ctx->ac.i32);
+ /* fall through */
+ case 4: /* as v4i32 */
+ vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
+ break;
+ }
+
+ ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
+ vdata, num_comps,
+ so_write_offsets[buf_idx],
+ ctx->ac.i32_0,
+ stream_out->dst_offset * 4, ac_glc | ac_slc);
+}
+
+/**
+ * Write streamout data to buffers for vertex stream @p stream (different
+ * vertex streams can occur for GS copy shaders).
+ */
+void si_llvm_emit_streamout(struct si_shader_context *ctx,
+ struct si_shader_output_values *outputs,
+ unsigned noutput, unsigned stream)
+{
+ struct si_shader_selector *sel = ctx->shader->selector;
+ struct pipe_stream_output_info *so = &sel->so;
+ LLVMBuilderRef builder = ctx->ac.builder;
+ int i;
+
+ /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
+ LLVMValueRef so_vtx_count =
+ si_unpack_param(ctx, ctx->streamout_config, 16, 7);
+
+ LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
+
+ /* can_emit = tid < so_vtx_count; */
+ LLVMValueRef can_emit =
+ LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
+
+	/* Emit the streamout code conditionally. This avoids out-of-bounds
+	 * buffer accesses: the hw tells us via the SGPR (so_vtx_count) which
+	 * threads are allowed to emit streamout data. */
+ ac_build_ifcc(&ctx->ac, can_emit, 6501);
+ {
+ /* The buffer offset is computed as follows:
+ * ByteOffset = streamout_offset[buffer_id]*4 +
+ * (streamout_write_index + thread_id)*stride[buffer_id] +
+ * attrib_offset
+ */
+
+ LLVMValueRef so_write_index =
+ ac_get_arg(&ctx->ac,
+ ctx->streamout_write_index);
+
+ /* Compute (streamout_write_index + thread_id). */
+ so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
+
+ /* Load the descriptor and compute the write offset for each
+ * enabled buffer. */
+ LLVMValueRef so_write_offset[4] = {};
+ LLVMValueRef so_buffers[4];
+ LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
+ ctx->rw_buffers);
+
+ for (i = 0; i < 4; i++) {
+ if (!so->stride[i])
+ continue;
+
+ LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
+ SI_VS_STREAMOUT_BUF0 + i, 0);
+
+ so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
+
+ LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
+ ctx->streamout_offset[i]);
+ so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
+
+ so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
+ LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0),
+ so_offset);
+ }
+
+ /* Write streamout data. */
+ for (i = 0; i < so->num_outputs; i++) {
+ unsigned reg = so->output[i].register_index;
+
+ if (reg >= noutput)
+ continue;
+
+ if (stream != so->output[i].stream)
+ continue;
+
+ si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset,
+ &so->output[i], &outputs[reg]);
+ }
+ }
+ ac_build_endif(&ctx->ac, 6501);
+}
+
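+/* Compute the two user clip distance exports (POS+2 and POS+3) as dot
+ * products of the clip vertex with the user clip planes loaded from the
+ * SI_VS_CONST_CLIP_PLANES constant buffer.
+ */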
+static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
+ struct ac_export_args *pos, LLVMValueRef *out_elts)
+{
+ unsigned reg_index;
+ unsigned chan;
+ unsigned const_chan;
+ LLVMValueRef base_elt;
+ LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
+ LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
+ SI_VS_CONST_CLIP_PLANES, 0);
+ LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
+
+ for (reg_index = 0; reg_index < 2; reg_index ++) {
+ struct ac_export_args *args = &pos[2 + reg_index];
+
+ args->out[0] =
+ args->out[1] =
+ args->out[2] =
+ args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
+
+ /* Compute dot products of position and user clip plane vectors */
+ for (chan = 0; chan < 4; chan++) {
+ for (const_chan = 0; const_chan < 4; const_chan++) {
+ LLVMValueRef addr =
+ LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
+ const_chan) * 4, 0);
+ base_elt = si_buffer_load_const(ctx, const_resource,
+ addr);
+ args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
+ out_elts[const_chan], args->out[chan]);
+ }
+ }
+
+ args->enabled_channels = 0xf;
+ args->valid_mask = 0;
+ args->done = 0;
+ args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
+ args->compr = 0;
+ }
+}
+
+/* Initialize arguments for the shader export intrinsic */
+static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
+ LLVMValueRef *values,
+ unsigned target,
+ struct ac_export_args *args)
+{
+ args->enabled_channels = 0xf; /* writemask - default is 0xf */
+ args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
+ args->done = 0; /* Specify whether this is the last export */
+ args->target = target; /* Specify the target we are exporting */
+ args->compr = false;
+
+ memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+}
+
+static void si_export_param(struct si_shader_context *ctx, unsigned index,
+ LLVMValueRef *values)
+{
+ struct ac_export_args args;
+
+ si_llvm_init_vs_export_args(ctx, values,
+ V_008DFC_SQ_EXP_PARAM + index, &args);
+ ac_build_export(&ctx->ac, &args);
+}
+
+static void si_build_param_exports(struct si_shader_context *ctx,
+ struct si_shader_output_values *outputs,
+ unsigned noutput)
+{
+ struct si_shader *shader = ctx->shader;
+ unsigned param_count = 0;
+
+ for (unsigned i = 0; i < noutput; i++) {
+ unsigned semantic_name = outputs[i].semantic_name;
+ unsigned semantic_index = outputs[i].semantic_index;
+
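+		/* Skip outputs whose components all go to non-zero vertex
+		 * streams; only stream 0 is rasterized. */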
+ if (outputs[i].vertex_stream[0] != 0 &&
+ outputs[i].vertex_stream[1] != 0 &&
+ outputs[i].vertex_stream[2] != 0 &&
+ outputs[i].vertex_stream[3] != 0)
+ continue;
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_LAYER:
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ case TGSI_SEMANTIC_CLIPDIST:
+ case TGSI_SEMANTIC_COLOR:
+ case TGSI_SEMANTIC_BCOLOR:
+ case TGSI_SEMANTIC_PRIMID:
+ case TGSI_SEMANTIC_FOG:
+ case TGSI_SEMANTIC_TEXCOORD:
+ case TGSI_SEMANTIC_GENERIC:
+ break;
+ default:
+ continue;
+ }
+
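+		/* Skip outputs that the shader key marks as killed, i.e.
+		 * outputs not consumed by the next stage. */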
+ if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
+ semantic_index < SI_MAX_IO_GENERIC) &&
+ shader->key.opt.kill_outputs &
+ (1ull << si_shader_io_get_unique_index(semantic_name,
+ semantic_index, true)))
+ continue;
+
+ si_export_param(ctx, param_count, outputs[i].values);
+
+ assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+ shader->info.vs_output_param_offset[i] = param_count++;
+ }
+
+ shader->info.nr_param_exports = param_count;
+}
+
+/**
+ * Vertex color clamping.
+ *
+ * This loads a state constant from a user data SGPR and adds an IF
+ * statement that clamps all colors if the constant is true.
+ */
+static void si_vertex_color_clamping(struct si_shader_context *ctx,
+ struct si_shader_output_values *outputs,
+ unsigned noutput)
+{
+ LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
+ bool has_colors = false;
+
+ /* Store original colors to alloca variables. */
+ for (unsigned i = 0; i < noutput; i++) {
+ if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+ outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
+ LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
+ }
+ has_colors = true;
+ }
+
+ if (!has_colors)
+ return;
+
+ /* The state is in the first bit of the user SGPR. */
+ LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
+ cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
+
+ ac_build_ifcc(&ctx->ac, cond, 6502);
+
+ /* Store clamped colors to alloca variables within the conditional block. */
+ for (unsigned i = 0; i < noutput; i++) {
+ if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+ outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ LLVMBuildStore(ctx->ac.builder,
+ ac_build_clamp(&ctx->ac, outputs[i].values[j]),
+ addr[i][j]);
+ }
+ }
+ ac_build_endif(&ctx->ac, 6502);
+
+ /* Load clamped colors */
+ for (unsigned i = 0; i < noutput; i++) {
+ if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
+ outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
+ continue;
+
+ for (unsigned j = 0; j < 4; j++) {
+ outputs[i].values[j] =
+ LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
+ }
+ }
+}
+
+/* Generate export instructions for the hardware VS shader stage or the
+ * NGG GS stage (position and parameter data only).
+ */
+void si_llvm_build_vs_exports(struct si_shader_context *ctx,
+ struct si_shader_output_values *outputs,
+ unsigned noutput)
+{
+ struct si_shader *shader = ctx->shader;
+ struct ac_export_args pos_args[4] = {};
+	LLVMValueRef psize_value = NULL, edgeflag_value = NULL;
+	LLVMValueRef layer_value = NULL, viewport_index_value = NULL;
+ unsigned pos_idx;
+ int i;
+
+ si_vertex_color_clamping(ctx, outputs, noutput);
+
+ /* Build position exports. */
+ for (i = 0; i < noutput; i++) {
+ switch (outputs[i].semantic_name) {
+ case TGSI_SEMANTIC_POSITION:
+ si_llvm_init_vs_export_args(ctx, outputs[i].values,
+ V_008DFC_SQ_EXP_POS, &pos_args[0]);
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ psize_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_LAYER:
+ layer_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ viewport_index_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ edgeflag_value = outputs[i].values[0];
+ break;
+ case TGSI_SEMANTIC_CLIPDIST:
+ if (!shader->key.opt.clip_disable) {
+ unsigned index = 2 + outputs[i].semantic_index;
+ si_llvm_init_vs_export_args(ctx, outputs[i].values,
+ V_008DFC_SQ_EXP_POS + index,
+ &pos_args[index]);
+ }
+ break;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ if (!shader->key.opt.clip_disable) {
+ si_llvm_emit_clipvertex(ctx, pos_args,
+ outputs[i].values);
+ }
+ break;
+ }
+ }
+
+ /* We need to add the position output manually if it's missing. */
+ if (!pos_args[0].out[0]) {
+ pos_args[0].enabled_channels = 0xf; /* writemask */
+ pos_args[0].valid_mask = 0; /* EXEC mask */
+ pos_args[0].done = 0; /* last export? */
+ pos_args[0].target = V_008DFC_SQ_EXP_POS;
+ pos_args[0].compr = 0; /* COMPR flag */
+ pos_args[0].out[0] = ctx->ac.f32_0; /* X */
+ pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
+ pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
+ pos_args[0].out[3] = ctx->ac.f32_1; /* W */
+ }
+
+ bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
+ !shader->key.as_ngg;
+
+ /* Write the misc vector (point size, edgeflag, layer, viewport). */
+ if (shader->selector->info.writes_psize ||
+ pos_writes_edgeflag ||
+ shader->selector->info.writes_viewport_index ||
+ shader->selector->info.writes_layer) {
+ pos_args[1].enabled_channels = shader->selector->info.writes_psize |
+ (pos_writes_edgeflag << 1) |
+ (shader->selector->info.writes_layer << 2);
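+		/* The viewport index channel is enabled below, because its
+		 * position differs between chip generations (out.z[19:16]
+		 * on GFX9+, out.w before that). */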
+
+ pos_args[1].valid_mask = 0; /* EXEC mask */
+ pos_args[1].done = 0; /* last export? */
+ pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
+ pos_args[1].compr = 0; /* COMPR flag */
+ pos_args[1].out[0] = ctx->ac.f32_0; /* X */
+ pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
+ pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
+ pos_args[1].out[3] = ctx->ac.f32_0; /* W */
+
+ if (shader->selector->info.writes_psize)
+ pos_args[1].out[0] = psize_value;
+
+ if (pos_writes_edgeflag) {
+ /* The output is a float, but the hw expects an integer
+ * with the first bit containing the edge flag. */
+ edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
+ edgeflag_value,
+ ctx->ac.i32, "");
+ edgeflag_value = ac_build_umin(&ctx->ac,
+ edgeflag_value,
+ ctx->ac.i32_1);
+
+ /* The LLVM intrinsic expects a float. */
+ pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
+ }
+
+ if (ctx->screen->info.chip_class >= GFX9) {
+ /* GFX9 has the layer in out.z[10:0] and the viewport
+ * index in out.z[19:16].
+ */
+ if (shader->selector->info.writes_layer)
+ pos_args[1].out[2] = layer_value;
+
+ if (shader->selector->info.writes_viewport_index) {
+ LLVMValueRef v = viewport_index_value;
+
+ v = ac_to_integer(&ctx->ac, v);
+ v = LLVMBuildShl(ctx->ac.builder, v,
+ LLVMConstInt(ctx->ac.i32, 16, 0), "");
+ v = LLVMBuildOr(ctx->ac.builder, v,
+ ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
+ pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
+ pos_args[1].enabled_channels |= 1 << 2;
+ }
+ } else {
+ if (shader->selector->info.writes_layer)
+ pos_args[1].out[2] = layer_value;
+
+ if (shader->selector->info.writes_viewport_index) {
+ pos_args[1].out[3] = viewport_index_value;
+ pos_args[1].enabled_channels |= 1 << 3;
+ }
+ }
+ }
+
+ for (i = 0; i < 4; i++)
+ if (pos_args[i].out[0])
+ shader->info.nr_pos_exports++;
+
+ /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+ * Setting valid_mask=1 prevents it and has no other effect.
+ */
+ if (ctx->screen->info.family == CHIP_NAVI10 ||
+ ctx->screen->info.family == CHIP_NAVI12 ||
+ ctx->screen->info.family == CHIP_NAVI14)
+ pos_args[0].valid_mask = 1;
+
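+	/* Renumber the targets, because position exports must occupy
+	 * consecutive slots starting at POS0, and set DONE on the last one. */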
+ pos_idx = 0;
+ for (i = 0; i < 4; i++) {
+ if (!pos_args[i].out[0])
+ continue;
+
+ /* Specify the target we are exporting */
+ pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
+
+ if (pos_idx == shader->info.nr_pos_exports)
+ /* Specify that this is the last export */
+ pos_args[i].done = 1;
+
+ ac_build_export(&ctx->ac, &pos_args[i]);
+ }
+
+ /* Build parameter exports. */
+ si_build_param_exports(ctx, outputs, noutput);
+}
+
+void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
+ LLVMValueRef *addrs)
+{
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ struct si_shader_output_values *outputs = NULL;
+	int i, j;
+
+ assert(!ctx->shader->is_gs_copy_shader);
+ assert(info->num_outputs <= max_outputs);
+
+ outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
+
+ for (i = 0; i < info->num_outputs; i++) {
+ outputs[i].semantic_name = info->output_semantic_name[i];
+ outputs[i].semantic_index = info->output_semantic_index[i];
+
+ for (j = 0; j < 4; j++) {
+ outputs[i].values[j] =
+ LLVMBuildLoad(ctx->ac.builder,
+ addrs[4 * i + j],
+ "");
+ outputs[i].vertex_stream[j] =
+ (info->output_streams[i] >> (2 * j)) & 3;
+ }
+ }
+
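+	/* Legacy streamout; with NGG, streamout is handled by the
+	 * NGG-specific code paths instead. */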
+ if (!ctx->screen->use_ngg_streamout &&
+ ctx->shader->selector->so.num_outputs)
+ si_llvm_emit_streamout(ctx, outputs, i, 0);
+
+ /* Export PrimitiveID. */
+ if (ctx->shader->key.mono.u.vs_export_prim_id) {
+ outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+ outputs[i].semantic_index = 0;
+ outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
+ for (j = 1; j < 4; j++)
+ outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
+
+ memset(outputs[i].vertex_stream, 0,
+ sizeof(outputs[i].vertex_stream));
+ i++;
+ }
+
+ si_llvm_build_vs_exports(ctx, outputs, i);
+ FREE(outputs);
+}
+
+static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
+ unsigned max_outputs,
+ LLVMValueRef *addrs)
+{
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+ struct si_shader_info *info = &ctx->shader->selector->info;
+ LLVMValueRef pos[4] = {};
+
+ assert(info->num_outputs <= max_outputs);
+
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
+ continue;
+
+ for (unsigned chan = 0; chan < 4; chan++)
+ pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
+ break;
+ }
+ assert(pos[0] != NULL);
+
+ /* Return the position output. */
+ LLVMValueRef ret = ctx->return_value;
+ for (unsigned chan = 0; chan < 4; chan++)
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
+ ctx->return_value = ret;
+}
+
+/**
+ * Build the vertex shader prolog function.
+ *
+ * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
+ * All inputs are returned unmodified. The vertex load indices are stored
+ * after them; the API VS uses them for fetching inputs.
+ *
+ * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
+ * input_v0,
+ * input_v1,
+ * input_v2,
+ * input_v3,
+ * (VertexID + BaseVertex),
+ * (InstanceID + StartInstance),
+ * (InstanceID / 2 + StartInstance)
+ */
+void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
+ union si_shader_part_key *key)
+{
+ LLVMTypeRef *returns;
+ LLVMValueRef ret, func;
+ int num_returns, i;
+ unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
+ unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+ struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
+ struct ac_arg input_vgpr_param[9];
+ LLVMValueRef input_vgprs[9];
+ unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
+ num_input_vgprs;
+ unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
+
+ memset(&ctx->args, 0, sizeof(ctx->args));
+
+ /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+ returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
+ sizeof(LLVMTypeRef));
+ num_returns = 0;
+
+ /* Declare input and output SGPRs. */
+ for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
+ &input_sgpr_param[i]);
+ returns[num_returns++] = ctx->ac.i32;
+ }
+
+ struct ac_arg merged_wave_info = input_sgpr_param[3];
+
+ /* Preloaded VGPRs (outputs must be floats) */
+ for (i = 0; i < num_input_vgprs; i++) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
+ returns[num_returns++] = ctx->ac.f32;
+ }
+
+ /* Vertex load indices. */
+ for (i = 0; i < key->vs_prolog.num_inputs; i++)
+ returns[num_returns++] = ctx->ac.f32;
+
+ /* Create the function. */
+ si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
+ func = ctx->main_fn;
+
+ for (i = 0; i < num_input_vgprs; i++) {
+ input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
+ }
+
+ if (key->vs_prolog.num_merged_next_stage_vgprs) {
+ if (!key->vs_prolog.is_monolithic)
+ si_init_exec_from_input(ctx, merged_wave_info, 0);
+
+ if (key->vs_prolog.as_ls &&
+ ctx->screen->info.has_ls_vgpr_init_bug) {
+ /* If there are no HS threads, SPI loads the LS VGPRs
+ * starting at VGPR 0. Shift them back to where they
+ * belong.
+ */
+ LLVMValueRef has_hs_threads =
+ LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
+ si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
+ ctx->ac.i32_0, "");
+
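+			/* Move input VGPRs [0..3] up to [2..5]; the selects
+			 * keep the original values when HS threads exist. */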
+ for (i = 4; i > 0; --i) {
+ input_vgprs[i + 1] =
+ LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
+ input_vgprs[i + 1],
+ input_vgprs[i - 1], "");
+ }
+ }
+ }
+
+ if (key->vs_prolog.gs_fast_launch_tri_list ||
+ key->vs_prolog.gs_fast_launch_tri_strip) {
+ LLVMValueRef wave_id, thread_id_in_tg;
+
+ wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+ thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+ ac_get_thread_id(&ctx->ac));
+
+ /* The GS fast launch initializes all VGPRs to the value of
+ * the first thread, so we have to add the thread ID.
+ *
+ * Only these are initialized by the hw:
+ * VGPR2: Base Primitive ID
+ * VGPR5: Base Vertex ID
+ * VGPR6: Instance ID
+ */
+
+ /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+ * The NGG cull shader will read them from there.
+ */
+ if (key->vs_prolog.gs_fast_launch_tri_list) {
+ input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
+ LLVMConstInt(ctx->ac.i32, 0, 0));
+ input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
+ LLVMConstInt(ctx->ac.i32, 1, 0));
+ input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
+ LLVMConstInt(ctx->ac.i32, 2, 0));
+ } else {
+ assert(key->vs_prolog.gs_fast_launch_tri_strip);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ /* Triangle indices: */
+ LLVMValueRef index[3] = {
+ thread_id_in_tg,
+ LLVMBuildAdd(builder, thread_id_in_tg,
+ LLVMConstInt(ctx->ac.i32, 1, 0), ""),
+ LLVMBuildAdd(builder, thread_id_in_tg,
+ LLVMConstInt(ctx->ac.i32, 2, 0), ""),
+ };
+ LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
+ thread_id_in_tg, ctx->ac.i1, "");
+ LLVMValueRef flatshade_first =
+ LLVMBuildICmp(builder, LLVMIntEQ,
+ si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
+ ctx->ac.i32_0, "");
+
+ ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
+ flatshade_first, index);
+ input_vgprs[0] = index[0];
+ input_vgprs[1] = index[1];
+ input_vgprs[4] = index[2];
+ }
+
+ /* Triangles always have all edge flags set initially. */
+ input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
+
+ input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
+ thread_id_in_tg, ""); /* PrimID */
+ input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
+ thread_id_in_tg, ""); /* VertexID */
+ input_vgprs[8] = input_vgprs[6]; /* InstanceID */
+ }
+
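+	/* VertexID is always the first VS VGPR; where InstanceID follows
+	 * depends on the chip generation and on whether the shader runs
+	 * as LS. */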
+ unsigned vertex_id_vgpr = first_vs_vgpr;
+ unsigned instance_id_vgpr =
+ ctx->screen->info.chip_class >= GFX10 ?
+ first_vs_vgpr + 3 :
+ first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
+
+ ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
+ ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
+
+ /* InstanceID = VertexID >> 16;
+ * VertexID = VertexID & 0xffff;
+ */
+ if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
+ ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
+ LLVMConstInt(ctx->ac.i32, 16, 0), "");
+ ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
+ LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
+ }
+
+	/* Copy inputs to outputs. This should be a no-op, as the registers
+	 * match, but it will prevent the compiler from overwriting them
+	 * unintentionally.
+	 */
+ ret = ctx->return_value;
+ for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+ LLVMValueRef p = LLVMGetParam(func, i);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
+ }
+ for (i = 0; i < num_input_vgprs; i++) {
+ LLVMValueRef p = input_vgprs[i];
+
+ if (i == vertex_id_vgpr)
+ p = ctx->abi.vertex_id;
+ else if (i == instance_id_vgpr)
+ p = ctx->abi.instance_id;
+
+ p = ac_to_float(&ctx->ac, p);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
+ key->vs_prolog.num_input_sgprs + i, "");
+ }
+
+ /* Compute vertex load indices from instance divisors. */
+ LLVMValueRef instance_divisor_constbuf = NULL;
+
+ if (key->vs_prolog.states.instance_divisor_is_fetched) {
+ LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+ LLVMValueRef buf_index =
+ LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+ instance_divisor_constbuf =
+ ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
+ }
+
+ for (i = 0; i < key->vs_prolog.num_inputs; i++) {
+ bool divisor_is_one =
+ key->vs_prolog.states.instance_divisor_is_one & (1u << i);
+ bool divisor_is_fetched =
+ key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
+ LLVMValueRef index = NULL;
+
+ if (divisor_is_one) {
+ index = ctx->abi.instance_id;
+ } else if (divisor_is_fetched) {
+ LLVMValueRef udiv_factors[4];
+
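+			/* Each input has 4 dwords in the constant buffer:
+			 * the precomputed multiplier, pre-shift, post-shift
+			 * and increment for the fast division below. */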
+ for (unsigned j = 0; j < 4; j++) {
+ udiv_factors[j] =
+ si_buffer_load_const(ctx, instance_divisor_constbuf,
+ LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0));
+ udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
+ }
+			/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+			 * Such an InstanceID is unlikely to be reached in a
+			 * reasonable amount of time, though.
+			 */
+ index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+ udiv_factors[0], udiv_factors[1],
+ udiv_factors[2], udiv_factors[3]);
+ }
+
+ if (divisor_is_one || divisor_is_fetched) {
+ /* Add StartInstance. */
+ index = LLVMBuildAdd(ctx->ac.builder, index,
+ LLVMGetParam(ctx->main_fn, user_sgpr_base +
+ SI_SGPR_START_INSTANCE), "");
+ } else {
+ /* VertexID + BaseVertex */
+ index = LLVMBuildAdd(ctx->ac.builder,
+ ctx->abi.vertex_id,
+ LLVMGetParam(func, user_sgpr_base +
+ SI_SGPR_BASE_VERTEX), "");
+ }
+
+ index = ac_to_float(&ctx->ac, index);
+ ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
+ ctx->args.arg_count + i, "");
+ }
+
+ si_llvm_build_ret(ctx, ret);
+}
+
+static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
+{
+ struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+
+ /* For non-indexed draws, the base vertex set by the driver
+ * (for direct draws) or the CP (for indirect draws) is the
+ * first vertex ID, but GLSL expects 0 to be returned.
+ */
+ LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
+ ctx->vs_state_bits);
+ LLVMValueRef indexed;
+
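+	/* Bit 1 of the VS state bits says whether the draw is indexed. */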
+ indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
+ indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
+
+ return LLVMBuildSelect(ctx->ac.builder, indexed,
+ ac_get_arg(&ctx->ac, ctx->args.base_vertex),
+ ctx->ac.i32_0, "");
+}
+
+void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
+{
+ struct si_shader *shader = ctx->shader;
+
+ if (shader->key.as_ls)
+ ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
+ else if (shader->key.as_es)
+ ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+ else if (shader->key.opt.vs_as_prim_discard_cs)
+ ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
+ else if (shader->key.as_ngg)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
+ else
+ ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+
+ ctx->abi.load_base_vertex = get_base_vertex;
+}
si_lower_nir(sscreen, nir);
}
-static void declare_nir_input_vs(struct si_shader_context *ctx,
- struct nir_variable *variable,
- unsigned input_index,
- LLVMValueRef out[4])
-{
- si_llvm_load_input_vs(ctx, input_index, out);
-}
-
-static void bitcast_inputs(struct si_shader_context *ctx,
- LLVMValueRef data[4],
- unsigned input_idx)
-{
- for (unsigned chan = 0; chan < 4; chan++) {
- ctx->inputs[input_idx + chan] =
- LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, "");
- }
-}
-
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
{
- struct si_shader_info *info = &ctx->shader->selector->info;
-
if (nir->info.stage == MESA_SHADER_VERTEX) {
- uint64_t processed_inputs = 0;
- nir_foreach_variable(variable, &nir->inputs) {
- unsigned attrib_count = glsl_count_attribute_slots(variable->type,
- true);
- unsigned input_idx = variable->data.driver_location;
-
- LLVMValueRef data[4];
- unsigned loc = variable->data.location;
-
- for (unsigned i = 0; i < attrib_count; i++) {
- /* Packed components share the same location so skip
- * them if we have already processed the location.
- */
- if (processed_inputs & ((uint64_t)1 << (loc + i))) {
- input_idx += 4;
- continue;
- }
-
- declare_nir_input_vs(ctx, variable, input_idx / 4, data);
- bitcast_inputs(ctx, data, input_idx);
- if (glsl_type_is_dual_slot(variable->type)) {
- input_idx += 4;
- declare_nir_input_vs(ctx, variable, input_idx / 4, data);
- bitcast_inputs(ctx, data, input_idx);
- }
-
- processed_inputs |= ((uint64_t)1 << (loc + i));
- input_idx += 4;
- }
- }
+ si_llvm_load_vs_inputs(ctx, nir);
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
unsigned colors_read =
ctx->shader->selector->info.colors_read;