* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <llvm/Config/llvm-config.h>
-
#include "util/u_memory.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_from_mesa.h"
#include "ac_exp_param.h"
-#include "ac_shader_util.h"
#include "ac_rtld.h"
-#include "ac_llvm_util.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"
if (rshift)
value = LLVMBuildLShr(ctx->ac.builder, value,
- LLVMConstInt(ctx->i32, rshift, 0), "");
+ LLVMConstInt(ctx->ac.i32, rshift, 0), "");
if (rshift + bitwidth < 32) {
unsigned mask = (1 << bitwidth) - 1;
value = LLVMBuildAnd(ctx->ac.builder, value,
- LLVMConstInt(ctx->i32, mask, 0), "");
+ LLVMConstInt(ctx->ac.i32, mask, 0), "");
}
return value;
if (index == 1)
return LLVMBuildAShr(ctx->ac.builder, i32,
- LLVMConstInt(ctx->i32, 16, 0), "");
+ LLVMConstInt(ctx->ac.i32, 16, 0), "");
return LLVMBuildSExt(ctx->ac.builder,
LLVMBuildTrunc(ctx->ac.builder, i32,
ctx->ac.i16, ""),
- ctx->i32, "");
+ ctx->ac.i32, "");
}
void si_llvm_load_input_vs(
LLVMValueRef vertex_id = ctx->abi.vertex_id;
LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
LLVMIntULE, vertex_id,
- ctx->i32_1, "");
+ ctx->ac.i32_1, "");
/* Use LLVMIntNE, because we have 3 vertices and only
* the middle one should use y2.
*/
LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
LLVMIntNE, vertex_id,
- ctx->i32_1, "");
+ ctx->ac.i32_1, "");
unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
if (input_index == 0) {
LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
y1, y2, "");
- out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
- out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
+ out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
+ out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
out[2] = LLVMGetParam(ctx->main_fn,
param_vs_blit_inputs + 2);
out[3] = ctx->ac.f32_1;
unsigned index= input_index - num_vbos_in_user_sgprs;
vb_desc = ac_build_load_to_sgpr(&ctx->ac,
ac_get_arg(&ctx->ac, ctx->vertex_buffers),
- LLVMConstInt(ctx->i32, index, 0));
+ LLVMConstInt(ctx->ac.i32, index, 0));
}
vertex_index = LLVMGetParam(ctx->main_fn,
fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
for (unsigned i = 0; i < 4; ++i)
- out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
+ out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
return;
}
}
for (unsigned i = 0; i < num_fetches; ++i) {
- LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
+ LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
channels_per_fetch, 0, true);
}
if (num_fetches == 1 && channels_per_fetch > 1) {
LLVMValueRef fetch = fetches[0];
for (unsigned i = 0; i < channels_per_fetch; ++i) {
- tmp = LLVMConstInt(ctx->i32, i, false);
+ tmp = LLVMConstInt(ctx->ac.i32, i, false);
fetches[i] = LLVMBuildExtractElement(
ctx->ac.builder, fetch, tmp, "");
}
}
for (unsigned i = num_fetches; i < 4; ++i)
- fetches[i] = LLVMGetUndef(ctx->f32);
+ fetches[i] = LLVMGetUndef(ctx->ac.f32);
if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
required_channels == 4) {
* convert it to a signed one.
*/
LLVMValueRef tmp = fetches[3];
- LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
+ LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
/* First, recover the sign-extended signed integer value. */
if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
- tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
+ tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
else
tmp = ac_to_integer(&ctx->ac, tmp);
*/
tmp = LLVMBuildShl(ctx->ac.builder, tmp,
fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
- LLVMConstInt(ctx->i32, 7, 0) : c30, "");
+ LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
/* Convert back to the right type. */
if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
LLVMValueRef clamp;
- LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
- tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
+ LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
+ tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
} else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
- tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
+ tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
}
fetches[3] = tmp;
unsigned swizzle)
{
if (swizzle > 0)
- return ctx->i32_0;
+ return ctx->ac.i32_0;
switch (ctx->type) {
case PIPE_SHADER_VERTEX:
return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
default:
assert(0);
- return ctx->i32_0;
+ return ctx->ac.i32_0;
}
}
ctx->vs_state_bits);
LLVMValueRef indexed;
- indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
- indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
+ indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
+ indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
return LLVMBuildSelect(ctx->ac.builder, indexed,
ac_get_arg(&ctx->ac, ctx->args.base_vertex),
- ctx->i32_0, "");
+ ctx->ac.i32_0, "");
}
static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
};
for (i = 0; i < 3; ++i)
- values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
+ values[i] = LLVMConstInt(ctx->ac.i32, sizes[i], 0);
result = ac_build_gather_values(&ctx->ac, values, 3);
} else {
struct si_shader_selector *sel = ctx->shader->selector;
unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
- LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
+ LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
LLVMValueRef var;
assert(!ctx->ac.lds);
var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- LLVMArrayType(ctx->i8, lds_size),
+ LLVMArrayType(ctx->ac.i8, lds_size),
"compute_lds",
AC_ADDR_SPACE_LDS);
LLVMSetAlignment(var, 64 * 1024);
unsigned const_chan;
LLVMValueRef base_elt;
LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
- LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
+ LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
SI_VS_CONST_CLIP_PLANES, 0);
LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
args->out[0] =
args->out[1] =
args->out[2] =
- args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
+ args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
/* Compute dot products of position and user clip plane vectors */
for (chan = 0; chan < 4; chan++) {
for (const_chan = 0; const_chan < 4; const_chan++) {
LLVMValueRef addr =
- LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
+ LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
const_chan) * 4, 0);
base_elt = si_buffer_load_const(ctx, const_resource,
addr);
break;
}
/* as v4i32 (aligned to 4) */
- out[3] = LLVMGetUndef(ctx->i32);
+ out[3] = LLVMGetUndef(ctx->ac.i32);
/* fall through */
case 4: /* as v4i32 */
vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
vdata, num_comps,
so_write_offsets[buf_idx],
- ctx->i32_0,
+ ctx->ac.i32_0,
stream_out->dst_offset * 4, ac_glc | ac_slc);
}
if (!so->stride[i])
continue;
- LLVMValueRef offset = LLVMConstInt(ctx->i32,
+ LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
SI_VS_STREAMOUT_BUF0 + i, 0);
so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
ctx->streamout_offset[i]);
- so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
+ so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
- LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
+ LLVMConstInt(ctx->ac.i32, so->stride[i]*4, 0),
so_offset);
}
continue;
for (unsigned j = 0; j < 4; j++) {
- addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->f32, "");
+ addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
}
has_colors = true;
/* The state is in the first bit of the user SGPR. */
LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
- cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, "");
+ cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
ac_build_ifcc(&ctx->ac, cond, 6502);
* with the first bit containing the edge flag. */
edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
edgeflag_value,
- ctx->i32, "");
+ ctx->ac.i32, "");
edgeflag_value = ac_build_umin(&ctx->ac,
edgeflag_value,
- ctx->i32_1);
+ ctx->ac.i32_1);
/* The LLVM intrinsic expects a float. */
pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
v = ac_to_integer(&ctx->ac, v);
v = LLVMBuildShl(ctx->ac.builder, v,
- LLVMConstInt(ctx->i32, 16, 0), "");
+ LLVMConstInt(ctx->ac.i32, 16, 0), "");
v = LLVMBuildOr(ctx->ac.builder, v,
ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
outputs[i].semantic_index = 0;
outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
for (j = 1; j < 4; j++)
- outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
+ outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
memset(outputs[i].vertex_stream, 0,
sizeof(outputs[i].vertex_stream));
}
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
- unsigned *num_prolog_vgprs)
+ unsigned *num_prolog_vgprs,
+ bool ngg_cull_shader)
{
struct si_shader *shader = ctx->shader;
}
if (!shader->is_gs_copy_shader) {
+ if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+ &ctx->ngg_old_thread_id);
+ }
+
/* Vertex load indices. */
if (shader->selector->info.num_inputs) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
}
}
-static void declare_tes_input_vgprs(struct si_shader_context *ctx)
+/* Declare the TES input VGPRs: u/v tess coordinates, the relative patch ID
+ * and the patch ID. When NGG culling is enabled and this is the main shader
+ * part running after the cull shader (ngg_cull_shader == false), one extra
+ * VGPR carries the thread's pre-culling ("old") thread ID.
+ */
+static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
{
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
+
+	/* Input only for the part that follows the NGG cull shader. */
+ if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
+ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
+ &ctx->ngg_old_thread_id);
+ }
}
enum {
ac_add_arg(args, file, registers, type, arg);
}
-void si_create_function(struct si_shader_context *ctx)
+void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
{
struct si_shader *shader = ctx->shader;
LLVMTypeRef returns[AC_MAX_ARGS];
declare_vs_blit_inputs(ctx, vs_blit_property);
/* VGPRs */
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
break;
}
}
/* VGPRs */
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
/* Return values */
if (shader->key.opt.vs_as_prim_discard_cs) {
for (i = 0; i < 4; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
}
break;
* placed after the user SGPRs.
*/
for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
for (i = 0; i < 11; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
break;
case SI_SHADER_MERGED_VERTEX_TESSCTRL:
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
/* LS return values are inputs to the TCS main shader part. */
for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
for (i = 0; i < 2; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
} else {
/* TCS return values are inputs to the TCS epilog.
*
* should be passed to the epilog.
*/
for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
for (i = 0; i < 11; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
}
break;
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
+ &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
declare_global_desc_pointers(ctx);
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
if (ctx->type == PIPE_SHADER_VERTEX) {
- declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
+ declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
- declare_tes_input_vgprs(ctx);
+ declare_tes_input_vgprs(ctx, ngg_cull_shader);
}
- if (ctx->shader->key.as_es &&
+ if ((ctx->shader->key.as_es || ngg_cull_shader) &&
(ctx->type == PIPE_SHADER_VERTEX ||
ctx->type == PIPE_SHADER_TESS_EVAL)) {
- unsigned num_user_sgprs;
+ unsigned num_user_sgprs, num_vgprs;
- if (ctx->type == PIPE_SHADER_VERTEX)
- num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
- else
+ if (ctx->type == PIPE_SHADER_VERTEX) {
+ /* For the NGG cull shader, add 1 SGPR to hold
+ * the vertex buffer pointer.
+ */
+ num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
+
+ if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
+ assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
+ num_user_sgprs = SI_SGPR_VS_VB_DESCRIPTOR_FIRST +
+ shader->selector->num_vbos_in_user_sgprs * 4;
+ }
+ } else {
num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
+ }
+
+ /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
+ *
+ * The normal merged ESGS shader only has to return the 5 VGPRs
+ * for the GS stage.
+ */
+ num_vgprs = ngg_cull_shader ? 10 : 5;
/* ES return values are inputs to GS. */
for (i = 0; i < 8 + num_user_sgprs; i++)
- returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 5; i++)
- returns[num_returns++] = ctx->f32; /* VGPRs */
+ returns[num_returns++] = ctx->ac.i32; /* SGPRs */
+ for (i = 0; i < num_vgprs; i++)
+ returns[num_returns++] = ctx->ac.f32; /* VGPRs */
}
break;
}
/* VGPRs */
- declare_tes_input_vgprs(ctx);
+ declare_tes_input_vgprs(ctx, ngg_cull_shader);
break;
case PIPE_SHADER_GEOMETRY:
PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
for (i = 0; i < num_return_sgprs; i++)
- returns[i] = ctx->i32;
+ returns[i] = ctx->ac.i32;
for (; i < num_returns; i++)
- returns[i] = ctx->f32;
+ returns[i] = ctx->ac.f32;
break;
case PIPE_SHADER_COMPUTE:
return;
}
- si_llvm_create_func(ctx, "main", returns, num_returns,
- si_get_max_workgroup_size(shader));
+ si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main",
+ returns, num_returns, si_get_max_workgroup_size(shader));
/* Reserve register locations for VGPR inputs the PS prolog may need. */
if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
* own LDS-based lowering).
*/
ctx->ac.lds = LLVMAddGlobalInAddressSpace(
- ctx->ac.module, LLVMArrayType(ctx->i32, 0),
+ ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
"__lds_end", AC_ADDR_SPACE_LDS);
LLVMSetAlignment(ctx->ac.lds, 256);
} else {
si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}
-int si_compile_llvm(struct si_screen *sscreen,
- struct si_shader_binary *binary,
- struct ac_shader_config *conf,
- struct ac_llvm_compiler *compiler,
- LLVMModuleRef mod,
- struct pipe_debug_callback *debug,
- enum pipe_shader_type shader_type,
- unsigned wave_size,
- const char *name,
- bool less_optimized)
-{
- unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
-
- if (si_can_dump_shader(sscreen, shader_type)) {
- fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
-
- if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
- fprintf(stderr, "%s LLVM IR:\n\n", name);
- ac_dump_module(mod);
- fprintf(stderr, "\n");
- }
- }
-
- if (sscreen->record_llvm_ir) {
- char *ir = LLVMPrintModuleToString(mod);
- binary->llvm_ir_string = strdup(ir);
- LLVMDisposeMessage(ir);
- }
-
- if (!si_replace_shader(count, binary)) {
- unsigned r = si_llvm_compile(mod, binary, compiler, debug,
- less_optimized, wave_size);
- if (r)
- return r;
- }
-
- struct ac_rtld_binary rtld;
- if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
- .info = &sscreen->info,
- .shader_type = tgsi_processor_to_shader_stage(shader_type),
- .wave_size = wave_size,
- .num_parts = 1,
- .elf_ptrs = &binary->elf_buffer,
- .elf_sizes = &binary->elf_size }))
- return -1;
-
- bool ok = ac_rtld_read_config(&rtld, conf);
- ac_rtld_close(&rtld);
- if (!ok)
- return -1;
-
- /* Enable 64-bit and 16-bit denormals, because there is no performance
- * cost.
- *
- * If denormals are enabled, all floating-point output modifiers are
- * ignored.
- *
- * Don't enable denormals for 32-bit floats, because:
- * - Floating-point output modifiers would be ignored by the hw.
- * - Some opcodes don't support denormals, such as v_mad_f32. We would
- * have to stop using those.
- * - GFX6 & GFX7 would be very slow.
- */
- conf->float_mode |= V_00B028_FP_64_DENORMS;
-
- return 0;
-}
-
static void si_dump_shader_key_vs(const struct si_shader_key *key,
const struct si_vs_prolog_bits *prolog,
const char *prefix, FILE *f)
!key->as_es && !key->as_ls) {
fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
+ if (shader_type != PIPE_SHADER_GEOMETRY)
+ fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
}
}
{
LLVMValueRef args[] = {
ac_get_arg(&ctx->ac, param),
- LLVMConstInt(ctx->i32, bitoffset, 0),
+ LLVMConstInt(ctx->ac.i32, bitoffset, 0),
};
ac_build_intrinsic(&ctx->ac,
"llvm.amdgcn.init.exec.from.input",
- ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
+ ctx->ac.voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
}
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
- const struct si_vs_prolog_bits *key)
+ const struct si_vs_prolog_bits *prolog_key,
+ const struct si_shader_key *key,
+ bool ngg_cull_shader)
{
/* VGPR initialization fixup for Vega10 and Raven is always done in the
* VS prolog. */
return sel->vs_needs_prolog ||
- key->ls_vgpr_fix ||
- key->unpack_instance_id_from_vertex_id;
+ prolog_key->ls_vgpr_fix ||
+ prolog_key->unpack_instance_id_from_vertex_id ||
+ (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}
static bool si_build_main_function(struct si_shader_context *ctx,
- struct nir_shader *nir, bool free_nir)
+ struct nir_shader *nir, bool free_nir,
+ bool ngg_cull_shader)
{
struct si_shader *shader = ctx->shader;
struct si_shader_selector *sel = shader->selector;
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
else if (shader->key.opt.vs_as_prim_discard_cs)
ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
else if (shader->key.as_ngg)
ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
else
if (shader->key.as_es)
ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
+ else if (ngg_cull_shader)
+ ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
else if (shader->key.as_ngg)
ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
else
return false;
}
- si_create_function(ctx);
+ si_create_function(ctx, ngg_cull_shader);
if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
si_preload_esgs_ring(ctx);
sel->info.tessfactors_are_def_in_all_invocs) {
for (unsigned i = 0; i < 6; i++) {
ctx->invoc0_tess_factors[i] =
- ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
+ ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
}
}
if (ctx->type == PIPE_SHADER_GEOMETRY) {
for (unsigned i = 0; i < 4; i++) {
ctx->gs_next_vertex[i] =
- ac_build_alloca(&ctx->ac, ctx->i32, "");
+ ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
}
if (shader->key.as_ngg) {
for (unsigned i = 0; i < 4; ++i) {
if (sel->so.num_outputs)
scratch_size = 44;
- LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
+ assert(!ctx->gs_ngg_scratch);
+ LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, scratch_size);
ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
- LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
+ LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
LLVMSetAlignment(ctx->gs_ngg_emit, 4);
}
/* This is really only needed when streamout and / or vertex
* compaction is enabled.
*/
- if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
- LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
+ if (!ctx->gs_ngg_scratch &&
+ (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
+ LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, 8);
ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
(shader->key.as_es || shader->key.as_ls) &&
(ctx->type == PIPE_SHADER_TESS_EVAL ||
(ctx->type == PIPE_SHADER_VERTEX &&
- !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
+ !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+ &shader->key, ngg_cull_shader)))) {
si_init_exec_from_input(ctx,
ctx->merged_wave_info, 0);
} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
if (!shader->is_monolithic ||
(ctx->type == PIPE_SHADER_TESS_EVAL &&
- (shader->key.as_ngg && !shader->key.as_es)))
+ shader->key.as_ngg && !shader->key.as_es &&
+ !shader->key.opt.ngg_culling))
ac_init_exec_full_mask(&ctx->ac);
+ if ((ctx->type == PIPE_SHADER_VERTEX ||
+ ctx->type == PIPE_SHADER_TESS_EVAL) &&
+ shader->key.as_ngg && !shader->key.as_es &&
+ !shader->key.opt.ngg_culling) {
+ gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
+
+ /* Build the primitive export at the beginning
+ * of the shader if possible.
+ */
+ if (gfx10_ngg_export_prim_early(shader))
+ gfx10_ngg_build_export_prim(ctx, NULL, NULL);
+ }
+
if (ctx->type == PIPE_SHADER_TESS_CTRL ||
ctx->type == PIPE_SHADER_GEOMETRY) {
if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
}
if (sel->force_correct_derivs_after_kill) {
- ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
+ ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
/* true = don't kill. */
- LLVMBuildStore(ctx->ac.builder, ctx->i1true,
+ LLVMBuildStore(ctx->ac.builder, ctx->ac.i1true,
ctx->postponed_kill);
}
*
* \param info Shader info of the vertex shader.
* \param num_input_sgprs Number of input SGPRs for the vertex shader.
+ * \param ngg_cull_shader Whether the preceding shader part is the NGG cull shader.
* \param prolog_key Key of the VS prolog
* \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
* \param key Output shader part key.
*/
static void si_get_vs_prolog_key(const struct si_shader_info *info,
unsigned num_input_sgprs,
+ bool ngg_cull_shader,
const struct si_vs_prolog_bits *prolog_key,
struct si_shader *shader_out,
union si_shader_part_key *key)
key->vs_prolog.as_es = shader_out->key.as_es;
key->vs_prolog.as_ngg = shader_out->key.as_ngg;
+ if (ngg_cull_shader) {
+ key->vs_prolog.gs_fast_launch_tri_list = !!(shader_out->key.opt.ngg_culling &
+ SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
+ key->vs_prolog.gs_fast_launch_tri_strip = !!(shader_out->key.opt.ngg_culling &
+ SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
+ } else {
+ key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
+ }
+
if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
key->vs_prolog.as_ls = 1;
key->vs_prolog.num_merged_next_stage_vgprs = 2;
arg_type = AC_ARG_CONST_IMAGE_PTR;
else
assert(0);
- } else if (type == ctx->f32) {
+ } else if (type == ctx->ac.f32) {
arg_type = AC_ARG_CONST_FLOAT_PTR;
} else {
assert(0);
for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
LLVMTypeRef param_type = LLVMTypeOf(param);
- LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->i32 : ctx->f32;
+ LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->ac.i32 : ctx->ac.f32;
unsigned size = ac_get_type_size(param_type) / 4;
if (size == 1) {
if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
- param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
- param_type = ctx->i32;
+ param = LLVMBuildPtrToInt(builder, param, ctx->ac.i32, "");
+ param_type = ctx->ac.i32;
}
if (param_type != out_type)
LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
- param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
- param_type = ctx->i64;
+ param = LLVMBuildPtrToInt(builder, param, ctx->ac.i64, "");
+ param_type = ctx->ac.i64;
}
if (param_type != vector_type)
for (unsigned j = 0; j < size; ++j)
out[num_out++] = LLVMBuildExtractElement(
- builder, param, LLVMConstInt(ctx->i32, j, 0), "");
+ builder, param, LLVMConstInt(ctx->ac.i32, j, 0), "");
}
if (ctx->args.args[i].file == AC_ARG_SGPR)
LLVMValueRef ena, count = initial[3];
count = LLVMBuildAnd(builder, count,
- LLVMConstInt(ctx->i32, 0x7f, 0), "");
+ LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
ena = LLVMBuildICmp(builder, LLVMIntULT,
ac_get_thread_id(&ctx->ac), count, "");
ac_build_ifcc(&ctx->ac, ena, 6506);
if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
if (LLVMGetPointerAddressSpace(param_type) ==
AC_ADDR_SPACE_CONST_32BIT) {
- arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
+ arg = LLVMBuildBitCast(builder, arg, ctx->ac.i32, "");
arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
} else {
- arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
+ arg = LLVMBuildBitCast(builder, arg, ctx->ac.i64, "");
arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
}
} else {
assert(num_out < ARRAY_SIZE(out));
out[num_out++] = val;
- if (LLVMTypeOf(val) == ctx->i32) {
+ if (LLVMTypeOf(val) == ctx->ac.i32) {
assert(num_out_sgpr + 1 == num_out);
num_out_sgpr = num_out;
}
return NULL;
}
+/* Set the context to a certain shader. Can be called repeatedly
+ * to change the shader.
+ *
+ * Copies the shader pointer and its stage into the context and caches the
+ * per-stage resource counts derived from the selector's shader info, so the
+ * rest of compilation doesn't have to re-derive them from the declaration
+ * bitmasks.
+ */
+static void si_shader_context_set_ir(struct si_shader_context *ctx,
+				     struct si_shader *shader)
+{
+ struct si_shader_selector *sel = shader->selector;
+ const struct si_shader_info *info = &sel->info;
+
+ ctx->shader = shader;
+ ctx->type = sel->type;
+
+	/* util_last_bit() of a "declared" mask = one past the highest declared
+	 * slot, i.e. the number of slots that must be considered bound. */
+ ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
+ ctx->num_shader_buffers = util_last_bit(info->shader_buffers_declared);
+
+ ctx->num_samplers = util_last_bit(info->samplers_declared);
+ ctx->num_images = util_last_bit(info->images_declared);
+}
+
int si_compile_shader(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader *shader,
}
si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
- si_llvm_context_set_ir(&ctx, shader);
+ si_shader_context_set_ir(&ctx, shader);
memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(shader->info.vs_output_param_offset));
shader->info.uses_instanceid = sel->info.uses_instanceid;
- if (!si_build_main_function(&ctx, nir, free_nir)) {
+ LLVMValueRef ngg_cull_main_fn = NULL;
+ if (ctx.shader->key.opt.ngg_culling) {
+ if (!si_build_main_function(&ctx, nir, false, true)) {
+ si_llvm_dispose(&ctx);
+ return -1;
+ }
+ ngg_cull_main_fn = ctx.main_fn;
+ ctx.main_fn = NULL;
+ /* Re-set the IR. */
+ si_shader_context_set_ir(&ctx, shader);
+ }
+
+ if (!si_build_main_function(&ctx, nir, free_nir, false)) {
si_llvm_dispose(&ctx);
return -1;
}
if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
- LLVMValueRef parts[2];
- bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
-
- parts[1] = ctx.main_fn;
+ LLVMValueRef parts[4];
+ unsigned num_parts = 0;
+ bool has_prolog = false;
+ LLVMValueRef main_fn = ctx.main_fn;
+
+ if (ngg_cull_main_fn) {
+ if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+ &shader->key, true)) {
+ union si_shader_part_key prolog_key;
+ si_get_vs_prolog_key(&sel->info,
+ shader->info.num_input_sgprs,
+ true,
+ &shader->key.part.vs.prolog,
+ shader, &prolog_key);
+ prolog_key.vs_prolog.is_monolithic = true;
+ si_build_vs_prolog_function(&ctx, &prolog_key);
+ parts[num_parts++] = ctx.main_fn;
+ has_prolog = true;
+ }
+ parts[num_parts++] = ngg_cull_main_fn;
+ }
- if (need_prolog) {
+ if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog,
+ &shader->key, false)) {
union si_shader_part_key prolog_key;
si_get_vs_prolog_key(&sel->info,
shader->info.num_input_sgprs,
+ false,
&shader->key.part.vs.prolog,
shader, &prolog_key);
prolog_key.vs_prolog.is_monolithic = true;
si_build_vs_prolog_function(&ctx, &prolog_key);
- parts[0] = ctx.main_fn;
+ parts[num_parts++] = ctx.main_fn;
+ has_prolog = true;
}
+ parts[num_parts++] = main_fn;
- si_build_wrapper_function(&ctx, parts + !need_prolog,
- 1 + need_prolog, need_prolog, 0);
+ si_build_wrapper_function(&ctx, parts, num_parts,
+ has_prolog ? 1 : 0, 0);
if (ctx.shader->key.opt.vs_as_prim_discard_cs)
si_build_prim_discard_compute_shader(&ctx);
+ } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
+ ngg_cull_main_fn) {
+ LLVMValueRef parts[2];
+
+ parts[0] = ngg_cull_main_fn;
+ parts[1] = ctx.main_fn;
+
+ si_build_wrapper_function(&ctx, parts, 2, 0, 0);
} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
if (sscreen->info.chip_class >= GFX9) {
struct si_shader_selector *ls = shader->key.part.tcs.ls;
LLVMValueRef parts[4];
bool vs_needs_prolog =
- si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
+ si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog,
+ &shader->key, false);
/* TCS main part */
parts[2] = ctx.main_fn;
shader_ls.key.mono = shader->key.mono;
shader_ls.key.opt = shader->key.opt;
shader_ls.is_monolithic = true;
- si_llvm_context_set_ir(&ctx, &shader_ls);
+ si_shader_context_set_ir(&ctx, &shader_ls);
- if (!si_build_main_function(&ctx, nir, free_nir)) {
+ if (!si_build_main_function(&ctx, nir, free_nir, false)) {
si_llvm_dispose(&ctx);
return -1;
}
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&ls->info,
shader_ls.info.num_input_sgprs,
+ false,
&shader->key.part.tcs.ls_prolog,
shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
shader_es.key.mono = shader->key.mono;
shader_es.key.opt = shader->key.opt;
shader_es.is_monolithic = true;
- si_llvm_context_set_ir(&ctx, &shader_es);
+ si_shader_context_set_ir(&ctx, &shader_es);
- if (!si_build_main_function(&ctx, nir, free_nir)) {
+ if (!si_build_main_function(&ctx, nir, free_nir, false)) {
si_llvm_dispose(&ctx);
return -1;
}
/* ES prolog */
if (es->type == PIPE_SHADER_VERTEX &&
- si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
+ si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog,
+ &shader->key, false)) {
union si_shader_part_key vs_prolog_key;
si_get_vs_prolog_key(&es->info,
shader_es.info.num_input_sgprs,
+ false,
&shader->key.part.gs.vs_prolog,
shader, &vs_prolog_key);
vs_prolog_key.vs_prolog.is_monolithic = true;
/* Compile to bytecode. */
r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
- ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
- si_get_shader_name(shader),
+ &ctx.ac, debug, ctx.type, si_get_shader_name(shader),
si_should_optimize_less(compiler, shader->selector));
si_llvm_dispose(&ctx);
if (r) {
si_llvm_optimize_module(&ctx);
if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
- ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
- name, false)) {
+ &ctx.ac, debug, ctx.type, name, false)) {
FREE(result);
result = NULL;
goto out;
LLVMValueRef ret, func;
int num_returns, i;
unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
- unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+ unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4 +
+ (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
- struct ac_arg input_vgpr_param[9];
- LLVMValueRef input_vgprs[9];
+ struct ac_arg input_vgpr_param[13];
+ LLVMValueRef input_vgprs[13];
unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
num_input_vgprs;
unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
&input_sgpr_param[i]);
- returns[num_returns++] = ctx->i32;
+ returns[num_returns++] = ctx->ac.i32;
}
struct ac_arg merged_wave_info = input_sgpr_param[3];
/* Preloaded VGPRs (outputs must be floats) */
for (i = 0; i < num_input_vgprs; i++) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
- returns[num_returns++] = ctx->f32;
+ returns[num_returns++] = ctx->ac.f32;
}
/* Vertex load indices. */
for (i = 0; i < key->vs_prolog.num_inputs; i++)
- returns[num_returns++] = ctx->f32;
+ returns[num_returns++] = ctx->ac.f32;
/* Create the function. */
si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
LLVMValueRef has_hs_threads =
LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
- ctx->i32_0, "");
+ ctx->ac.i32_0, "");
for (i = 4; i > 0; --i) {
input_vgprs[i + 1] =
}
}
+ if (key->vs_prolog.gs_fast_launch_tri_list ||
+ key->vs_prolog.gs_fast_launch_tri_strip) {
+ LLVMValueRef wave_id, thread_id_in_tg;
+
+ wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
+ thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
+ ac_get_thread_id(&ctx->ac));
+
+ /* The GS fast launch initializes all VGPRs to the value of
+ * the first thread, so we have to add the thread ID.
+ *
+ * Only these are initialized by the hw:
+ * VGPR2: Base Primitive ID
+ * VGPR5: Base Vertex ID
+ * VGPR6: Instance ID
+ */
+
+ /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
+ * The NGG cull shader will read them from there.
+ */
+ if (key->vs_prolog.gs_fast_launch_tri_list) {
+ input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
+ LLVMConstInt(ctx->ac.i32, 0, 0));
+ input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
+ LLVMConstInt(ctx->ac.i32, 1, 0));
+ input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
+ LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
+ LLVMConstInt(ctx->ac.i32, 2, 0));
+ } else {
+ assert(key->vs_prolog.gs_fast_launch_tri_strip);
+ LLVMBuilderRef builder = ctx->ac.builder;
+ /* Triangle indices: */
+ LLVMValueRef index[3] = {
+ thread_id_in_tg,
+ LLVMBuildAdd(builder, thread_id_in_tg,
+ LLVMConstInt(ctx->ac.i32, 1, 0), ""),
+ LLVMBuildAdd(builder, thread_id_in_tg,
+ LLVMConstInt(ctx->ac.i32, 2, 0), ""),
+ };
+ LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
+ thread_id_in_tg, ctx->ac.i1, "");
+ LLVMValueRef flatshade_first =
+ LLVMBuildICmp(builder, LLVMIntEQ,
+ si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
+ ctx->ac.i32_0, "");
+
+ ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
+ flatshade_first, index);
+ input_vgprs[0] = index[0];
+ input_vgprs[1] = index[1];
+ input_vgprs[4] = index[2];
+ }
+
+ /* Triangles always have all edge flags set initially. */
+ input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);
+
+ input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
+ thread_id_in_tg, ""); /* PrimID */
+ input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
+ thread_id_in_tg, ""); /* VertexID */
+ input_vgprs[8] = input_vgprs[6]; /* InstanceID */
+ }
+
unsigned vertex_id_vgpr = first_vs_vgpr;
unsigned instance_id_vgpr =
ctx->screen->info.chip_class >= GFX10 ?
*/
if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
- LLVMConstInt(ctx->i32, 16, 0), "");
+ LLVMConstInt(ctx->ac.i32, 16, 0), "");
ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
- LLVMConstInt(ctx->i32, 0xffff, 0), "");
+ LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
}
/* Copy inputs to outputs. This should be no-op, as the registers match,
if (key->vs_prolog.states.instance_divisor_is_fetched) {
LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
LLVMValueRef buf_index =
- LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+ LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
instance_divisor_constbuf =
ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
}
for (unsigned j = 0; j < 4; j++) {
udiv_factors[j] =
si_buffer_load_const(ctx, instance_divisor_constbuf,
- LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+ LLVMConstInt(ctx->ac.i32, i*16 + j*4, 0));
udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
}
/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
{
struct si_shader_selector *vs = main_part->selector;
- if (!si_vs_needs_prolog(vs, key))
+ if (!si_vs_needs_prolog(vs, key, &shader->key, false))
return true;
/* Get the prolog. */
union si_shader_part_key prolog_key;
- si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
+ si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false,
key, shader, &prolog_key);
shader->prolog =
return true;
}
+/* Free the CPU-side shader binary payloads (ELF image and the optional
+ * LLVM IR dump) and null the pointers so the function is idempotent and
+ * a later si_shader_destroy/double clean is harmless.
+ */
+void si_shader_binary_clean(struct si_shader_binary *binary)
+{
+	/* elf_buffer is stored const-qualified; cast away const to free it.
+	 * free(NULL) is a no-op, so no guard is needed.
+	 */
+	free((void *)binary->elf_buffer);
+	binary->elf_buffer = NULL;
+
+	free(binary->llvm_ir_string);
+	binary->llvm_ir_string = NULL;
+}
+
void si_shader_destroy(struct si_shader *shader)
{
if (shader->scratch_bo)