int param_tes_rel_patch_id;
int param_tes_patch_id;
int param_es2gs_offset;
+ int param_oc_lds;
+
+ /* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
+ * 0x800000 for VS, 0x1 for ES.
+ */
+ int param_tess_offchip;
LLVMTargetMachineRef tm;
struct si_shader *shader,
LLVMTargetMachineRef tm);
+static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
+ struct lp_build_tgsi_context *bld_base,
+ struct lp_build_emit_data *emit_data);
+
/* Ideally pass the sample mask input to the PS epilog as v13, which
* is its usual location, so that the shader doesn't have to add v_mov.
*/
LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
LLVMValueRef cc;
+ /* LLVM 3.8: If indirect resource indexing is used:
+ * - SI & CIK hang
+ * - VI crashes
+ */
+ if (HAVE_LLVM <= 0x0308)
+ return LLVMGetUndef(ctx->i32);
+
if (util_is_power_of_two(num)) {
result = LLVMBuildAnd(builder, result, c_max, "");
} else {
lp_build_const_int32(gallivm, param * 4), "");
}
+/* The offchip buffer layout for TCS->TES is
+ *
+ * - attribute 0 of patch 0 vertex 0
+ * - attribute 0 of patch 0 vertex 1
+ * - attribute 0 of patch 0 vertex 2
+ * ...
+ * - attribute 0 of patch 1 vertex 0
+ * - attribute 0 of patch 1 vertex 1
+ * ...
+ * - attribute 1 of patch 0 vertex 0
+ * - attribute 1 of patch 0 vertex 1
+ * ...
+ * - per patch attribute 0 of patch 0
+ * - per patch attribute 0 of patch 1
+ * ...
+ *
+ * Note that every attribute has 4 components.
+ */
+static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
+ LLVMValueRef vertex_index,
+ LLVMValueRef param_index)
+{
+ struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
+ LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
+ LLVMValueRef param_stride, constant16;
+
+ vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
+ num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
+ total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
+ num_patches, "");
+
+ constant16 = lp_build_const_int32(gallivm, 16);
+ if (vertex_index) {
+ base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
+ vertices_per_patch, "");
+
+ base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+ vertex_index, "");
+
+ param_stride = total_vertices;
+ } else {
+ base_addr = get_rel_patch_id(ctx);
+ param_stride = num_patches;
+ }
+
+ base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+ LLVMBuildMul(gallivm->builder, param_index,
+ param_stride, ""), "");
+
+ base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
+
+ if (!vertex_index) {
+ LLVMValueRef patch_data_offset =
+ unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
+
+ base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+ patch_data_offset, "");
+ }
+ return base_addr;
+}
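/* Illustration only (editor's note, not part of the patch): the LLVM IR built
 * above computes the equivalent of this host-side sketch. All names below are
 * hypothetical; vertex_index == ~0u stands in for the NULL vertex_index of the
 * per-patch case.
 */
static unsigned tcs_tes_byte_offset(unsigned rel_patch_id, unsigned vertex_index,
				    unsigned param_index, unsigned vertices_per_patch,
				    unsigned num_patches, unsigned patch_data_offset)
{
	if (vertex_index != ~0u) {
		/* Per-vertex attribute: vertices vary fastest, then patches,
		 * then attributes (see the layout comment above). */
		unsigned vertex = rel_patch_id * vertices_per_patch + vertex_index;
		unsigned param_stride = vertices_per_patch * num_patches;
		return (vertex + param_index * param_stride) * 16; /* vec4 = 16 bytes */
	}
	/* Per-patch attribute: stored after all per-vertex attributes. */
	return (rel_patch_id + param_index * num_patches) * 16 + patch_data_offset;
}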
+
+static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
+ struct si_shader_context *ctx,
+ const struct tgsi_full_dst_register *dst,
+ const struct tgsi_full_src_register *src)
+{
+ struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
+ struct tgsi_shader_info *info = &ctx->shader->selector->info;
+ ubyte *name, *index, *array_first;
+ struct tgsi_full_src_register reg;
+ LLVMValueRef vertex_index = NULL;
+ LLVMValueRef param_index = NULL;
+ unsigned param_index_base, param_base;
+
+ reg = src ? *src : tgsi_full_src_register_from_dst(dst);
+
+ if (reg.Register.Dimension) {
+ if (reg.Dimension.Indirect)
+ vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
+ reg.Dimension.Index);
+ else
+ vertex_index = lp_build_const_int32(gallivm,
+ reg.Dimension.Index);
+ }
+
+ /* Get information about the register. */
+ if (reg.Register.File == TGSI_FILE_INPUT) {
+ name = info->input_semantic_name;
+ index = info->input_semantic_index;
+ array_first = info->input_array_first;
+ } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
+ name = info->output_semantic_name;
+ index = info->output_semantic_index;
+ array_first = info->output_array_first;
+ } else {
+ assert(0);
+ return NULL;
+ }
+
+ if (reg.Register.Indirect) {
+ if (reg.Indirect.ArrayID)
+ param_base = array_first[reg.Indirect.ArrayID];
+ else
+ param_base = reg.Register.Index;
+
+ param_index = get_indirect_index(ctx, &reg.Indirect,
+ reg.Register.Index - param_base);
+
+ } else {
+ param_base = reg.Register.Index;
+ param_index = lp_build_const_int32(gallivm, 0);
+ }
+
+ param_index_base = si_shader_io_get_unique_index(name[param_base],
+ index[param_base]);
+
+ param_index = LLVMBuildAdd(gallivm->builder, param_index,
+ lp_build_const_int32(gallivm, param_index_base),
+ "");
+
+ return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
+}
+
+/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
+ * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
+ * or v4i32 (num_channels=3,4). */
+static void build_tbuffer_store(struct si_shader_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ unsigned num_channels,
+ LLVMValueRef vaddr,
+ LLVMValueRef soffset,
+ unsigned inst_offset,
+ unsigned dfmt,
+ unsigned nfmt,
+ unsigned offen,
+ unsigned idxen,
+ unsigned glc,
+ unsigned slc,
+ unsigned tfe)
+{
+ struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+ LLVMValueRef args[] = {
+ rsrc,
+ vdata,
+ LLVMConstInt(ctx->i32, num_channels, 0),
+ vaddr,
+ soffset,
+ LLVMConstInt(ctx->i32, inst_offset, 0),
+ LLVMConstInt(ctx->i32, dfmt, 0),
+ LLVMConstInt(ctx->i32, nfmt, 0),
+ LLVMConstInt(ctx->i32, offen, 0),
+ LLVMConstInt(ctx->i32, idxen, 0),
+ LLVMConstInt(ctx->i32, glc, 0),
+ LLVMConstInt(ctx->i32, slc, 0),
+ LLVMConstInt(ctx->i32, tfe, 0)
+ };
+
+ /* The instruction offset field has 12 bits */
+ assert(offen || inst_offset < (1 << 12));
+
+ /* The intrinsic is overloaded; we need to add a type suffix for overloading to work. */
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+ const char *types[] = {"i32", "v2i32", "v4i32"};
+ char name[256];
+ snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+
+ lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
+ args, ARRAY_SIZE(args), 0);
+}
+
+static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ unsigned num_channels,
+ LLVMValueRef vaddr,
+ LLVMValueRef soffset,
+ unsigned inst_offset)
+{
+ static unsigned dfmt[] = {
+ V_008F0C_BUF_DATA_FORMAT_32,
+ V_008F0C_BUF_DATA_FORMAT_32_32,
+ V_008F0C_BUF_DATA_FORMAT_32_32_32,
+ V_008F0C_BUF_DATA_FORMAT_32_32_32_32
+ };
+ assert(num_channels >= 1 && num_channels <= 4);
+
+ build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
+ inst_offset, dfmt[num_channels-1],
+ V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
+}
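/* Illustration only (editor's note, not part of the patch): e.g. storing one
 * dword of `value` at byte offset `off` relative to `soffset`:
 *
 *     build_tbuffer_store_dwords(ctx, rsrc, value, 1, vaddr, soffset, off);
 *
 * The dwords variant always uses the 32-bit UINT formats with offen=1, glc=1
 * and slc=1.
 */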
+
+static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
+ LLVMValueRef rsrc,
+ int num_channels,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned inst_offset,
+ unsigned glc,
+ unsigned slc)
+{
+ struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ if (HAVE_LLVM >= 0x309) {
+ LLVMValueRef args[] = {
+ LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
+ vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
+ LLVMConstInt(ctx->i32, inst_offset, 0),
+ LLVMConstInt(ctx->i1, glc, 0),
+ LLVMConstInt(ctx->i1, slc, 0)
+ };
+
+ LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
+ ctx->v4f32};
+ const char *type_names[] = {"f32", "v2f32", "v4f32"};
+ char name[256];
+
+ if (voffset) {
+ args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
+ "");
+ }
+
+ if (soffset) {
+ args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
+ "");
+ }
+
+ snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
+ type_names[func]);
+
+ return lp_build_intrinsic(gallivm->builder, name, types[func], args,
+ ARRAY_SIZE(args), LLVMReadOnlyAttribute |
+ LLVMNoUnwindAttribute);
+ } else {
+ LLVMValueRef args[] = {
+ LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
+ voffset ? voffset : vindex,
+ soffset,
+ LLVMConstInt(ctx->i32, inst_offset, 0),
+ LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
+ LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), // idxen
+ LLVMConstInt(ctx->i32, glc, 0),
+ LLVMConstInt(ctx->i32, slc, 0),
+ LLVMConstInt(ctx->i32, 0, 0), // TFE
+ };
+
+ LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
+ ctx->v4i32};
+ const char *type_names[] = {"i32", "v2i32", "v4i32"};
+ const char *arg_type = "i32";
+ char name[256];
+
+ if (voffset && vindex) {
+ LLVMValueRef vaddr[] = {vindex, voffset};
+
+ arg_type = "v2i32";
+ args[1] = lp_build_gather_values(gallivm, vaddr, 2);
+ }
+
+ snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
+ type_names[func], arg_type);
+
+ return lp_build_intrinsic(gallivm->builder, name, types[func], args,
+ ARRAY_SIZE(args), LLVMReadOnlyAttribute |
+ LLVMNoUnwindAttribute);
+ }
+}
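/* Illustration only (editor's note, not part of the patch): a 4-channel load
 * with a voffset but no vindex selects func = 2 and emits
 *
 *     llvm.amdgcn.buffer.load.v4f32            (LLVM >= 3.9)
 *     llvm.SI.buffer.load.dword.v4i32.i32      (older LLVM)
 */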
+
+static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
+ enum tgsi_opcode_type type, unsigned swizzle,
+ LLVMValueRef buffer, LLVMValueRef offset,
+ LLVMValueRef base)
+{
+ struct si_shader_context *ctx = si_shader_context(bld_base);
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
+ LLVMValueRef value, value2;
+ LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
+ LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
+
+ if (swizzle == ~0) {
+ value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
+ 0, 1, 0);
+
+ return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
+ }
+
+ if (!tgsi_type_is_64bit(type)) {
+ value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
+ 0, 1, 0);
+
+ value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
+ return LLVMBuildExtractElement(gallivm->builder, value,
+ lp_build_const_int32(gallivm, swizzle), "");
+ }
+
+ value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
+ swizzle * 4, 1, 0);
+
+ value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
+ swizzle * 4 + 4, 1, 0);
+
+ return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
+}
+
/**
* Load from LDS.
*
lp_build_const_int32(gallivm, swizzle));
value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
- if (type == TGSI_TYPE_DOUBLE) {
+ if (tgsi_type_is_64bit(type)) {
LLVMValueRef value2;
dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
lp_build_const_int32(gallivm, swizzle + 1));
value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
- return radeon_llvm_emit_fetch_double(bld_base, value, value2);
+ return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
return LLVMBuildBitCast(gallivm->builder, value,
enum tgsi_opcode_type type, unsigned swizzle)
{
struct si_shader_context *ctx = si_shader_context(bld_base);
- LLVMValueRef dw_addr, stride;
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
+ LLVMValueRef rw_buffers, buffer, base, addr;
- if (reg->Register.Dimension) {
- stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
- dw_addr = get_tcs_out_current_patch_offset(ctx);
- dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
- } else {
- dw_addr = get_tcs_out_current_patch_data_offset(ctx);
- dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
- }
+ rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_RW_BUFFERS);
+ buffer = build_indexed_load_const(ctx, rw_buffers,
+ lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
- return lds_load(bld_base, type, swizzle, dw_addr);
+ base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
+ addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
+
+ return buffer_load(bld_base, type, swizzle, buffer, base, addr);
}
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
LLVMValueRef dst[4])
{
struct si_shader_context *ctx = si_shader_context(bld_base);
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
const struct tgsi_full_dst_register *reg = &inst->Dst[0];
unsigned chan_index;
LLVMValueRef dw_addr, stride;
+ LLVMValueRef rw_buffers, buffer, base, buf_addr;
+ LLVMValueRef values[4];
/* Only handle per-patch and per-vertex outputs here.
* Vectors will be lowered to scalars and this function will be called again.
dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
}
+ rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_RW_BUFFERS);
+ buffer = build_indexed_load_const(ctx, rw_buffers,
+ lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
+
+ base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
+ buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
+
TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
LLVMValueRef value = dst[chan_index];
value = radeon_llvm_saturate(bld_base, value);
lds_store(bld_base, chan_index, dw_addr, value);
+
+ value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
+ values[chan_index] = value;
+
+ if (inst->Dst[0].Register.WriteMask != 0xF) {
+ build_tbuffer_store_dwords(ctx, buffer, value, 1,
+ buf_addr, base,
+ 4 * chan_index);
+ }
+ }
+
+ if (inst->Dst[0].Register.WriteMask == 0xF) {
+ LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
+ values, 4);
+ build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
+ base, 0);
}
}
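/* Illustration only (editor's note, not part of the patch): with a full
 * WriteMask (0xF) the four channels are gathered and written with a single
 * 4-dword tbuffer store at offset 0; a partial mask falls back to one 1-dword
 * store per enabled channel at byte offset 4 * chan_index. The LDS copy is
 * updated in both cases as well.
 */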
"llvm.SI.buffer.load.dword.i32.i32",
ctx->i32, args, 9,
LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
- if (type == TGSI_TYPE_DOUBLE) {
+ if (tgsi_type_is_64bit(type)) {
LLVMValueRef value2;
args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
value2 = lp_build_intrinsic(gallivm->builder,
"llvm.SI.buffer.load.dword.i32.i32",
ctx->i32, args, 9,
LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
- return radeon_llvm_emit_fetch_double(bld_base,
- value, value2);
+ return radeon_llvm_emit_fetch_64bit(bld_base, type,
+ value, value2);
}
return LLVMBuildBitCast(gallivm->builder,
value,
case TGSI_SEMANTIC_TESSINNER:
case TGSI_SEMANTIC_TESSOUTER:
{
- LLVMValueRef dw_addr;
+ LLVMValueRef rw_buffers, buffer, base, addr;
int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
- dw_addr = get_tcs_out_current_patch_data_offset(ctx);
- dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
- lp_build_const_int32(gallivm, param * 4), "");
+ rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
+ SI_PARAM_RW_BUFFERS);
+ buffer = build_indexed_load_const(ctx, rw_buffers,
+ lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
+
+ base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
+ addr = get_tcs_tes_buffer_address(ctx, NULL,
+ lp_build_const_int32(gallivm, param));
+
- value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
- ~0, dw_addr);
+ value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
+ ~0, buffer, base, addr);
break;
}
idx = reg->Register.Index * 4 + swizzle;
if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
- if (type != TGSI_TYPE_DOUBLE)
+ if (!tgsi_type_is_64bit(type))
return bitcast(bld_base, type, ctx->constants[buf][idx]);
else {
- return radeon_llvm_emit_fetch_double(bld_base,
- ctx->constants[buf][idx],
- ctx->constants[buf][idx + 1]);
+ return radeon_llvm_emit_fetch_64bit(bld_base, type,
+ ctx->constants[buf][idx],
+ ctx->constants[buf][idx + 1]);
}
}
result = buffer_load_const(base->gallivm->builder, bufp,
addr, ctx->f32);
- if (type != TGSI_TYPE_DOUBLE)
+ if (!tgsi_type_is_64bit(type))
result = bitcast(bld_base, type, result);
else {
LLVMValueRef addr2, result2;
result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
addr2, ctx->f32);
- result = radeon_llvm_emit_fetch_double(bld_base,
- result, result2);
+ result = radeon_llvm_emit_fetch_64bit(bld_base, type,
+ result, result2);
}
return result;
}
}
}
-/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
- * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
- * or v4i32 (num_channels=3,4). */
-static void build_tbuffer_store(struct si_shader_context *ctx,
- LLVMValueRef rsrc,
- LLVMValueRef vdata,
- unsigned num_channels,
- LLVMValueRef vaddr,
- LLVMValueRef soffset,
- unsigned inst_offset,
- unsigned dfmt,
- unsigned nfmt,
- unsigned offen,
- unsigned idxen,
- unsigned glc,
- unsigned slc,
- unsigned tfe)
-{
- struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
- LLVMValueRef args[] = {
- rsrc,
- vdata,
- LLVMConstInt(ctx->i32, num_channels, 0),
- vaddr,
- soffset,
- LLVMConstInt(ctx->i32, inst_offset, 0),
- LLVMConstInt(ctx->i32, dfmt, 0),
- LLVMConstInt(ctx->i32, nfmt, 0),
- LLVMConstInt(ctx->i32, offen, 0),
- LLVMConstInt(ctx->i32, idxen, 0),
- LLVMConstInt(ctx->i32, glc, 0),
- LLVMConstInt(ctx->i32, slc, 0),
- LLVMConstInt(ctx->i32, tfe, 0)
- };
-
- /* The instruction offset field has 12 bits */
- assert(offen || inst_offset < (1 << 12));
-
- /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
- const char *types[] = {"i32", "v2i32", "v4i32"};
- char name[256];
- snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
-
- lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
- args, ARRAY_SIZE(args), 0);
-}
-
-static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
- LLVMValueRef rsrc,
- LLVMValueRef vdata,
- unsigned num_channels,
- LLVMValueRef vaddr,
- LLVMValueRef soffset,
- unsigned inst_offset)
-{
- static unsigned dfmt[] = {
- V_008F0C_BUF_DATA_FORMAT_32,
- V_008F0C_BUF_DATA_FORMAT_32_32,
- V_008F0C_BUF_DATA_FORMAT_32_32_32,
- V_008F0C_BUF_DATA_FORMAT_32_32_32_32
- };
- assert(num_channels >= 1 && num_channels <= 4);
-
- build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
- inst_offset, dfmt[num_channels-1],
- V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
-}
-
/* On SI, the vertex shader is responsible for writing streamout data
* to buffers. */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
}
}
+static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
+{
+ struct si_shader_context *ctx = si_shader_context(bld_base);
+ struct gallivm_state *gallivm = bld_base->base.gallivm;
+ LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
+ LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
+ uint64_t inputs;
+
+ invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
+
+ rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
+ buffer = build_indexed_load_const(ctx, rw_buffers,
+ lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
+
+ buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
+
+ lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
+ lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
+ lds_vertex_stride, "");
+ lds_base = get_tcs_in_current_patch_offset(ctx);
+ lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
+
+ inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
+ while (inputs) {
+ unsigned i = u_bit_scan64(&inputs);
+
+ LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
+ lp_build_const_int32(gallivm, 4 * i),
+ "");
+
+ LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
+ invocation_id,
+ lp_build_const_int32(gallivm, i));
+
+ LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
+ lds_ptr);
+
+ build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
+ buffer_offset, 0);
+ }
+}
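/* Illustration only (editor's note, not part of the patch): u_bit_scan64()
 * pops the lowest set bit on each call, so inputs_to_copy == 0x13 visits
 * i = 0, 1, 4 and copies one vec4 (16 bytes) per input from LDS to the
 * offchip ring.
 */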
+
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
LLVMValueRef rel_patch_id,
LLVMValueRef invocation_id,
LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
unsigned stride, outer_comps, inner_comps, i;
- struct lp_build_if_state if_ctx;
+ struct lp_build_if_state if_ctx, inner_if_ctx;
+
+ si_llvm_emit_barrier(NULL, bld_base, NULL);
/* Do this only for invocation 0, because the tess levels are per-patch,
* not per-vertex.
byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
lp_build_const_int32(gallivm, 4 * stride), "");
- /* Store the outputs. */
+ lp_build_if(&inner_if_ctx, gallivm,
+ LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
+ rel_patch_id, bld_base->uint_bld.zero, ""));
+
+ /* Store the dynamic HS control word. */
+ build_tbuffer_store_dwords(ctx, buffer,
+ lp_build_const_int32(gallivm, 0x80000000),
+ 1, lp_build_const_int32(gallivm, 0), tf_base, 0);
+
+ lp_build_endif(&inner_if_ctx);
+
+ /* Store the tessellation factors. */
build_tbuffer_store_dwords(ctx, buffer, vec0,
- MIN2(stride, 4), byteoffset, tf_base, 0);
+ MIN2(stride, 4), byteoffset, tf_base, 4);
if (vec1)
build_tbuffer_store_dwords(ctx, buffer, vec1,
- stride - 4, byteoffset, tf_base, 16);
+ stride - 4, byteoffset, tf_base, 20);
lp_build_endif(&if_ctx);
}
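/* Illustration only (editor's note, not part of the patch): the tess factor
 * ring now begins with the control word, which is why the stores above moved
 * from offsets 0/16 to 4/20:
 *
 *     dword 0:               0x80000000 (dynamic HS control word, patch 0 only)
 *     dwords 1 .. stride:    tess factors of patch 0
 *     dwords 1+stride .. :   tess factors of patch 1, and so on
 */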
tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
SI_PARAM_TESS_FACTOR_OFFSET);
ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
- SI_TCS_NUM_USER_SGPR, "");
+ SI_TCS_NUM_USER_SGPR + 1, "");
/* VGPRs */
rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
- vgpr = SI_TCS_NUM_USER_SGPR + 1;
+ vgpr = SI_TCS_NUM_USER_SGPR + 2;
ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
return;
}
+ si_copy_tcs_inputs(bld_base);
si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
struct lp_build_tgsi_context *bld_base,
struct lp_build_emit_data *emit_data)
{
+ struct si_shader_context *ctx = si_shader_context(bld_base);
struct lp_build_context *base = &bld_base->base;
unsigned opcode = emit_data->inst->Instruction.Opcode;
unsigned target = emit_data->inst->Texture.Texture;
case TGSI_OPCODE_TEX:
case TGSI_OPCODE_TEX2:
case TGSI_OPCODE_TXP:
+ if (ctx->type != PIPE_SHADER_FRAGMENT)
+ infix = ".lz";
break;
case TGSI_OPCODE_TXB:
case TGSI_OPCODE_TXB2:
+ assert(ctx->type == PIPE_SHADER_FRAGMENT);
infix = ".b";
break;
case TGSI_OPCODE_TXL:
break;
case TGSI_OPCODE_TG4:
name = "llvm.SI.gather4";
+ infix = ".lz";
break;
default:
assert(0);
else
LLVMAddAttribute(P, LLVMInRegAttribute);
}
+
+ if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
+ /* These were copied from some LLVM test. */
+ LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
+ "less-precise-fpmad",
+ "true");
+ LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
+ "no-infs-fp-math",
+ "true");
+ LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
+ "no-nans-fp-math",
+ "true");
+ LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
+ "unsafe-fp-math",
+ "true");
+ }
}
static void create_meta_data(struct si_shader_context *ctx)
/* Streamout SGPRs. */
if (so->num_outputs) {
- params[ctx->param_streamout_config = (*num_params)++] = i32;
+ if (ctx->type != PIPE_SHADER_TESS_EVAL)
+ params[ctx->param_streamout_config = (*num_params)++] = i32;
+ else
+ ctx->param_streamout_config = ctx->param_tess_offchip;
+
params[ctx->param_streamout_write_index = (*num_params)++] = i32;
}
/* A streamout buffer offset is loaded if the stride is non-zero. */
break;
case PIPE_SHADER_TESS_CTRL:
+ params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
+ params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
num_params = SI_PARAM_REL_IDS+1;
if (!ctx->is_monolithic) {
- /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
- for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
+ /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
+ * placed after the user SGPRs.
+ */
+ for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
returns[num_returns++] = ctx->i32; /* SGPRs */
for (i = 0; i < 3; i++)
break;
case PIPE_SHADER_TESS_EVAL:
- params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
- params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
- num_params = SI_PARAM_TCS_OUT_LAYOUT+1;
+ params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
+ num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
if (shader->key.tes.as_es) {
+ params[ctx->param_oc_lds = num_params++] = ctx->i32;
+ params[ctx->param_tess_offchip = num_params++] = ctx->i32;
params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
} else {
+ params[ctx->param_tess_offchip = num_params++] = ctx->i32;
declare_streamout_params(ctx, &shader->selector->so,
params, ctx->i32, &num_params);
+ params[ctx->param_oc_lds = num_params++] = ctx->i32;
}
last_sgpr = num_params - 1;
unsigned i;
const unsigned char *config =
radeon_shader_binary_config_start(binary, symbol_offset);
+ bool really_needs_scratch = false;
+
+ /* LLVM adds SGPR spills to the scratch size.
+ * Find out if we really need the scratch buffer.
+ */
+ for (i = 0; i < binary->reloc_count; i++) {
+ const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+
+ if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
+ !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+ really_needs_scratch = true;
+ break;
+ }
+ }
/* XXX: We may be able to emit some of these values directly rather than
* extracting fields to be emitted later.
case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
- conf->scratch_bytes_per_wave =
- G_00B860_WAVESIZE(value) * 256 * 4 * 1;
+ if (really_needs_scratch)
+ conf->scratch_bytes_per_wave =
+ G_00B860_WAVESIZE(value) * 256 * 4;
break;
default:
{
unsigned i;
uint32_t scratch_rsrc_dword0 = scratch_va;
uint32_t scratch_rsrc_dword1 =
- S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
- | S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
+ S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
+
+ /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
+ * correctly.
+ */
+ if (HAVE_LLVM >= 0x0309)
+ scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
+ else
+ scratch_rsrc_dword1 |=
+ S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
for (i = 0 ; i < shader->binary.reloc_count; i++) {
const struct radeon_shader_reloc *reloc =
unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
unsigned lds_per_wave = 0;
unsigned max_simd_waves = 10;
+ /* Assuming SGPRs aren't spilled: each spilled VGPR costs 4 bytes
+ * per lane, i.e. 64 * 4 bytes per wave.
+ */
+ unsigned spilled_vgprs = conf->scratch_bytes_per_wave / 64 / 4;
/* Compute LDS usage for PS. */
if (processor == PIPE_SHADER_FRAGMENT) {
fprintf(file, "*** SHADER STATS ***\n"
"SGPRS: %d\n"
"VGPRS: %d\n"
+ "Spilled VGPRs: %d\n"
"Code Size: %d bytes\n"
"LDS: %d blocks\n"
"Scratch: %d bytes per wave\n"
"Max Waves: %d\n"
"********************\n",
- conf->num_sgprs, conf->num_vgprs, code_size,
+ conf->num_sgprs, conf->num_vgprs, spilled_vgprs, code_size,
conf->lds_size, conf->scratch_bytes_per_wave,
max_simd_waves);
}
pipe_debug_message(debug, SHADER_INFO,
"Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
- "LDS: %d Scratch: %d Max Waves: %d",
+ "LDS: %d Scratch: %d Max Waves: %d Spilled VGPRs: %d",
conf->num_sgprs, conf->num_vgprs, code_size,
conf->lds_size, conf->scratch_bytes_per_wave,
- max_simd_waves);
+ max_simd_waves, spilled_vgprs);
}
static const char *si_get_shader_name(struct si_shader *shader,
}
if (!si_replace_shader(count, binary)) {
- r = radeon_llvm_compile(mod, binary,
- r600_get_llvm_processor_name(sscreen->b.family), tm,
- debug);
+ r = radeon_llvm_compile(mod, binary, tm, debug);
if (r)
return r;
}
return r;
}
+static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
+{
+ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
+ LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
+ else
+ LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
+}
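/* Illustration only (editor's note, not part of the patch): some shader parts
 * return a struct of SGPR/VGPR values (e.g. a main part feeding its epilog)
 * while others return void; si_llvm_build_ret() picks the right terminator
 * from the type of `ret`, so the call sites below don't have to.
 */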
+
/* Generate code for the hardware VS shader stage to go with a geometry shader */
static int si_generate_gs_copy_shader(struct si_screen *sscreen,
struct si_shader_context *ctx,
si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
- LLVMBuildRet(gallivm->builder, ctx->return_value);
+ LLVMBuildRetVoid(gallivm->builder);
/* Dump LLVM IR before any optimization passes */
if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
goto out;
}
- LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
+ si_llvm_build_ret(&ctx, ctx.return_value);
mod = bld_base->base.gallivm->module;
/* Dump LLVM IR before any optimization passes */
}
/* Compile. */
- LLVMBuildRet(gallivm->builder, ret);
+ si_llvm_build_ret(&ctx, ret);
radeon_llvm_finalize_module(&ctx.radeon_bld);
if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
}
/* Compile. */
- LLVMBuildRet(gallivm->builder, ctx.return_value);
+ LLVMBuildRetVoid(gallivm->builder);
radeon_llvm_finalize_module(&ctx.radeon_bld);
if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
params[SI_PARAM_SAMPLERS] = ctx.i64;
params[SI_PARAM_IMAGES] = ctx.i64;
params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
+ params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
+ params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
num_params = last_sgpr + 1;
LLVMGetParam(func, last_sgpr + 3));
/* Compile. */
- LLVMBuildRet(gallivm->builder, ctx.return_value);
+ LLVMBuildRetVoid(gallivm->builder);
radeon_llvm_finalize_module(&ctx.radeon_bld);
if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
linear_sample[i], base + 10 + i, "");
}
+ /* Tell LLVM to insert WQM instruction sequence when needed. */
+ if (key->ps_prolog.wqm) {
+ LLVMAddTargetDependentFunctionAttr(func,
+ "amdgpu-ps-wqm-outputs", "");
+ }
+
/* Compile. */
- LLVMBuildRet(gallivm->builder, ret);
+ si_llvm_build_ret(&ctx, ret);
radeon_llvm_finalize_module(&ctx.radeon_bld);
if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
prolog_key.ps_prolog.colors_read = info->colors_read;
prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
+ prolog_key.ps_prolog.wqm = info->uses_derivatives &&
+ (prolog_key.ps_prolog.colors_read ||
+ prolog_key.ps_prolog.states.force_persample_interp);
if (info->colors_read) {
unsigned *color = shader->selector->color_attr_index;
shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
(shader->selector->type == PIPE_SHADER_TESS_EVAL &&
shader->key.tes.as_es != mainp->key.tes.as_es) ||
+ (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
+ shader->key.tcs.epilog.inputs_to_copy) ||
shader->selector->type == PIPE_SHADER_COMPUTE) {
/* Monolithic shader (compiled as a whole, has many variants,
* may take a long time to compile).