struct pipe_resource *buffer,
unsigned stride, unsigned num_records,
bool add_tid, bool swizzle,
- unsigned element_size, unsigned index_stride)
+ unsigned element_size, unsigned index_stride, uint64_t offset)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
if (buffer) {
uint64_t va;
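+ /* offset is a byte offset into the buffer, so several ring bindings
+ * (e.g. one GSVS slice per GS stream) can share a single allocation. */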
- va = r600_resource(buffer)->gpu_address;
+ va = r600_resource(buffer)->gpu_address + offset;
switch (element_size) {
default:
case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
return 4095;
case PIPE_CAP_MAX_VERTEX_STREAMS:
- return 1;
+ return 4;
case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
return 2048;
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_flow.h"
#include "radeon/r600_cs.h"
#include "radeon/radeon_llvm.h"
LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
LLVMValueRef so_buffers[4];
LLVMValueRef esgs_ring;
- LLVMValueRef gsvs_ring;
- LLVMValueRef gs_next_vertex;
+ LLVMValueRef gsvs_ring[4];
+ LLVMValueRef gs_next_vertex[4];
};
static struct si_shader_context * si_shader_context(
LLVMValueRef can_emit =
LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
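+ /* Bits 24:25 of the streamout config SGPR carry a stream index;
+ * only outputs bound to that stream are written below. */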
+ LLVMValueRef stream_id =
+ unpack_param(shader, shader->param_streamout_config, 24, 2);
+
/* Emit the streamout code conditionally. This actually avoids
* out-of-bounds buffer access. The hw tells us via the SGPR
* (so_vtx_count) which threads are allowed to emit streamout data. */
unsigned reg = so->output[i].register_index;
unsigned start = so->output[i].start_component;
unsigned num_comps = so->output[i].num_components;
+ unsigned stream = so->output[i].stream;
LLVMValueRef out[4];
+ struct lp_build_if_state if_ctx_stream;
assert(num_comps && num_comps <= 4);
if (!num_comps || num_comps > 4)
continue;
}
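+ /* Store this output only when its stream matches stream_id. */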
+ LLVMValueRef can_emit_stream =
+ LLVMBuildICmp(builder, LLVMIntEQ,
+ stream_id,
+ lp_build_const_int32(gallivm, stream), "");
+
+ lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
vdata, num_comps,
so_write_offset[buf_idx],
LLVMConstInt(i32, 0, 0),
so->output[i].dst_offset*4);
+ lp_build_endif(&if_ctx_stream);
}
}
lp_build_endif(&if_ctx);
}
}
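+/* Return the stream index (0-3) encoded in the first source operand of a
+ * TGSI EMIT/CUT instruction; the assert below relies on it being an
+ * immediate. */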
+static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
+ struct lp_build_emit_data *emit_data)
+{
+ LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
+ struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
+ unsigned stream;
+
+ assert(src0.File == TGSI_FILE_IMMEDIATE);
+
+ stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
+ return stream;
+}
+
/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(
const struct lp_build_tgsi_action *action,
LLVMValueRef args[2];
unsigned chan;
int i;
+ unsigned stream;
+
+ stream = si_llvm_get_stream(bld_base, emit_data);
/* Write vertex attribute values to GSVS ring */
- gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
+ gs_next_vertex = LLVMBuildLoad(gallivm->builder,
+ si_shader_ctx->gs_next_vertex[stream],
+ "");
/* If this thread has already emitted the declared maximum number of
* vertices, kill it: excessive vertex emissions are not supposed to
kill = lp_build_select(&bld_base->base, can_emit,
lp_build_const_float(gallivm, 1.0f),
lp_build_const_float(gallivm, -1.0f));
+
build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
build_tbuffer_store(si_shader_ctx,
- si_shader_ctx->gsvs_ring,
+ si_shader_ctx->gsvs_ring[stream],
out_val, 1,
voffset, soffset, 0,
V_008F0C_BUF_DATA_FORMAT_32,
}
gs_next_vertex = lp_build_add(uint, gs_next_vertex,
lp_build_const_int32(gallivm, 1));
- LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex);
+
+ LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]);
/* Signal vertex emission */
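+ /* The stream index is assumed to occupy bits 9:8 of the sendmsg
+ * operand, hence the stream << 8 below. */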
- args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS);
+ args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
LLVMVoidTypeInContext(gallivm->context), args, 2,
struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMValueRef args[2];
+ unsigned stream;
/* Signal primitive cut */
- args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS);
+ stream = si_llvm_get_stream(bld_base, emit_data);
+ args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
LLVMVoidTypeInContext(gallivm->context), args, 2,
build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
}
- if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY ||
- si_shader_ctx->shader->is_gs_copy_shader) {
+ if (si_shader_ctx->shader->is_gs_copy_shader) {
LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
- si_shader_ctx->gsvs_ring =
+ si_shader_ctx->gsvs_ring[0] =
build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
}
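+ /* A real GS writes up to four GSVS rings, one per stream; the
+ * SI_RING_GSVS* slots are consecutive, so SI_RING_GSVS + i indexes them. */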
+ if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+ int i;
+ for (i = 0; i < 4; i++) {
+ LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i);
+
+ si_shader_ctx->gsvs_ring[i] =
+ build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+ }
+ }
}
void si_shader_binary_read_config(const struct si_screen *sscreen,
preload_streamout_buffers(si_shader_ctx);
preload_ring_buffers(si_shader_ctx);
- args[0] = si_shader_ctx->gsvs_ring;
+ args[0] = si_shader_ctx->gsvs_ring[0];
args[1] = lp_build_mul_imm(uint,
LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
si_shader_ctx->param_vertex_id),
preload_ring_buffers(&si_shader_ctx);
if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
- si_shader_ctx.gs_next_vertex =
- lp_build_alloca(bld_base->base.gallivm,
- bld_base->uint_bld.elem_type, "");
+ int i;
+ for (i = 0; i < 4; i++) {
+ si_shader_ctx.gs_next_vertex[i] =
+ lp_build_alloca(bld_base->base.gallivm,
+ bld_base->uint_bld.elem_type, "");
+ }
}
if (!lp_build_tgsi_llvm(bld_base, tokens)) {
si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0);
si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
- si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0);
- si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0);
- si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0);
-
si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
#define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */
#define SI_RING_ESGS 0 /* for ES, GS */
#define SI_RING_GSVS 1 /* for GS, VS */
-#define SI_NUM_RING_BUFFERS 2
+#define SI_RING_GSVS_1 2 /* streams 1, 2, 3 for GS */
+#define SI_RING_GSVS_2 3
+#define SI_RING_GSVS_3 4
+#define SI_NUM_RING_BUFFERS 5
#define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS
#define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4)
struct pipe_resource *buffer,
unsigned stride, unsigned num_records,
bool add_tid, bool swizzle,
- unsigned element_size, unsigned index_stride);
+ unsigned element_size, unsigned index_stride, uint64_t offset);
void si_init_all_descriptors(struct si_context *sctx);
void si_release_all_descriptors(struct si_context *sctx);
void si_all_descriptors_begin_new_cs(struct si_context *sctx);
si_set_tesseval_regs(shader, pm4);
}
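+/* Return the highest stream index captured by streamout (0 if none is
+ * captured); streams above it need no GSVS ring space. */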
+static unsigned si_gs_get_max_stream(struct si_shader *shader)
+{
+ struct pipe_stream_output_info *so = &shader->selector->so;
+ unsigned max_stream = 0, i;
+
+ if (so->num_outputs == 0)
+ return 0;
+
+ for (i = 0; i < so->num_outputs; i++) {
+ if (so->output[i].stream > max_stream)
+ max_stream = so->output[i].stream;
+ }
+ return max_stream;
+}
+
static void si_shader_gs(struct si_shader *shader)
{
- unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2);
+ unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
- unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+ unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
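+ /* gs_vert_itemsize is in bytes; gsvs_itemsize and the VGT itemsize
+ * registers are in dwords, hence the >> 2 conversions below. */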
unsigned gs_num_invocations = shader->selector->gs_num_invocations;
unsigned cut_mode;
struct si_pm4_state *pm4;
unsigned num_sgprs, num_user_sgprs;
uint64_t va;
+ unsigned max_stream = si_gs_get_max_stream(shader);
/* The GSVS_RING_ITEMSIZE register takes 15 bits */
assert(gsvs_itemsize < (1 << 15));
S_028A40_GS_WRITE_OPTIMIZE(1));
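+ /* VGT_GSVS_RING_OFFSET_n appears to hold the cumulative dword offset
+ * at which stream n's slice of the ring begins; slices for unused
+ * streams collapse onto earlier ones. */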
si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
- si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize);
- si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize);
+ si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
+ si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
- si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);
+ si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
- si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize);
+ si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
+ si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
+ si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
+ si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
sctx->esgs_ring, 0, esgs_ring_size,
- true, true, 4, 64);
+ true, true, 4, 64, 0);
si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
sctx->esgs_ring, 0, esgs_ring_size,
- false, false, 0, 0);
+ false, false, 0, 0, 0);
si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
sctx->gsvs_ring, 0, gsvs_ring_size,
- false, false, 0, 0);
+ false, false, 0, 0, 0);
}
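+/* Bind the GSVS ring once per stream: each binding covers 64 records of
+ * gsvs_itemsize bytes, so stream n's slice starts at byte offset
+ * n * gsvs_itemsize * 64. */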
+static void si_update_gs_rings(struct si_context *sctx)
+{
+ unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16;
+ unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices;
+ unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+ uint64_t offset;
+
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
+ sctx->gsvs_ring, gsvs_itemsize,
+ 64, true, true, 4, 16, 0);
+
+ offset = gsvs_itemsize * 64;
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1,
+ sctx->gsvs_ring, gsvs_itemsize,
+ 64, true, true, 4, 16, offset);
+
+ offset = (gsvs_itemsize * 2) * 64;
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2,
+ sctx->gsvs_ring, gsvs_itemsize,
+ 64, true, true, 4, 16, offset);
+
+ offset = (gsvs_itemsize * 3) * 64;
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3,
+ sctx->gsvs_ring, gsvs_itemsize,
+ 64, true, true, 4, 16, offset);
+}
/**
* @returns 1 if \p sel has been updated to use a new scratch buffer and 0
* otherwise.
si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL,
SI_RING_TESS_FACTOR, sctx->tf_ring, 0,
- sctx->tf_ring->width0, false, false, 0, 0);
+ sctx->tf_ring->width0, false, false, 0, 0, 0);
sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
}
int i;
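+ /* The enabled-buffer mask is assumed to hold four bits per stream, so
+ * shift each buffer's enable bit into the nibble of its stream. */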
for (i = 0; i < so->num_outputs; i++)
- enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer);
+ enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4);
sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask;
sctx->b.streamout.stride_in_dw = shader->so.stride;
}
if (!sctx->gs_rings)
si_init_gs_rings(sctx);
+
if (sctx->emitted.named.gs_rings != sctx->gs_rings)
sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings);
- si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
- sctx->gsvs_ring,
- sctx->gs_shader->gs_max_out_vertices *
- sctx->gs_shader->info.num_outputs * 16,
- 64, true, true, 4, 16);
+ si_update_gs_rings(sctx);
} else {
si_pm4_bind_state(sctx, gs_rings, NULL);
si_pm4_bind_state(sctx, gs, NULL);