/* SHADER STATES */
static void si_set_tesseval_regs(struct si_screen *sscreen,
- struct si_shader *shader,
+ struct si_shader_selector *tes,
struct si_pm4_state *pm4)
{
- struct tgsi_shader_info *info = &shader->selector->info;
+ struct tgsi_shader_info *info = &tes->info;
unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
* VS as ES | ES -> GS -> VS | 30
* TES as VS | LS -> HS -> VS | 14 or 30
* TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30
+ *
+ * If "shader" is NULL, it's assumed it's not LS or GS copy shader.
*/
static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen,
+ struct si_shader_selector *sel,
struct si_shader *shader,
struct si_pm4_state *pm4)
{
- unsigned type = shader->selector->type;
+ unsigned type = sel->type;
if (sscreen->b.family < CHIP_POLARIS10)
return;
/* VS as VS, or VS as ES: */
if ((type == PIPE_SHADER_VERTEX &&
- !shader->key.as_ls &&
- !shader->is_gs_copy_shader) ||
+ (!shader ||
+ (!shader->key.as_ls && !shader->is_gs_copy_shader))) ||
/* TES as VS, or TES as ES: */
type == PIPE_SHADER_TESS_EVAL) {
unsigned vtx_reuse_depth = 30;
if (type == PIPE_SHADER_TESS_EVAL &&
- shader->selector->info.properties[TGSI_PROPERTY_TES_SPACING] ==
+ sel->info.properties[TGSI_PROPERTY_TES_SPACING] ==
PIPE_TESS_SPACING_FRACTIONAL_ODD)
vtx_reuse_depth = 14;
si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
/* We need at least 2 components for LS.
- * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
- vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
+ * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
+ * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
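+ * (A VGPR_COMP_CNT value of n makes the hardware load input VGPRs 0..n,
+ * so 2 loads VertexID, RelAutoindex and InstanceID / StepRate0.)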
+ */
+ vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, va >> 40);
/* We need at least 2 components for LS.
- * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
- ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
+ * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
+ * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
+ */
+ ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1;
+
+ if (shader->config.scratch_bytes_per_wave) {
+ fprintf(stderr, "HS: scratch buffer unsupported");
+ abort();
+ }
shader->config.rsrc2 =
S_00B42C_USER_SGPR(GFX9_TCS_NUM_USER_SGPR) |
si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
if (shader->selector->type == PIPE_SHADER_VERTEX) {
- vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
+ /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
+ vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
num_user_sgprs = SI_VS_NUM_USER_SGPR;
} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
- vgpr_comp_cnt = 3; /* all components are needed for TES */
+ vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
num_user_sgprs = SI_TES_NUM_USER_SGPR;
} else
unreachable("invalid shader selector type");
S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader, pm4);
+ si_set_tesseval_regs(sscreen, shader->selector, pm4);
- polaris_set_vgt_vertex_reuse(sscreen, shader, pm4);
+ polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
}
/**
struct gfx9_gs_info gs_info;
if (es_type == PIPE_SHADER_VERTEX)
- es_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
+ /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
+ es_vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0;
else if (es_type == PIPE_SHADER_TESS_EVAL)
- es_vgpr_comp_cnt = 3; /* all components are needed for TES */
+ es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 3 : 2;
else
unreachable("invalid shader selector type");
S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup));
si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
shader->key.part.gs.es->esgs_itemsize / 4);
+
+ if (es_type == PIPE_SHADER_TESS_EVAL)
+ si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
+
+ polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es,
+ NULL, pm4);
+
+ if (shader->config.scratch_bytes_per_wave) {
+ fprintf(stderr, "GS: scratch buffer unsupported");
+ abort();
+ }
} else {
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
unsigned oc_lds_en;
unsigned window_space =
shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
- bool enable_prim_id = si_vs_exports_prim_id(shader);
+ bool enable_prim_id = shader->key.mono.vs_export_prim_id;
pm4 = si_get_shader_pm4_state(shader);
if (!pm4)
* not sent again.
*/
if (!gs) {
- si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
- S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0));
+ unsigned mode = 0;
+
+ /* PrimID needs GS scenario A.
+ * GFX9 also needs it when ViewportIndex is enabled.
+ */
+ if (enable_prim_id ||
+ (sscreen->b.chip_class >= GFX9 &&
+ shader->selector->info.writes_viewport_index))
+ mode = V_028A40_GS_SCENARIO_A;
+
+ si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, S_028A40_MODE(mode));
si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id);
} else {
si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(gs));
vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
- vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
+ /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
+ * If PrimID is disabled, InstanceID / StepRate1 is loaded instead.
+ * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
+ */
+ vgpr_comp_cnt = enable_prim_id ? 2 : (shader->info.uses_instanceid ? 1 : 0);
num_user_sgprs = SI_VS_NUM_USER_SGPR;
} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
- vgpr_comp_cnt = 3; /* all components are needed for TES */
+ vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
num_user_sgprs = SI_TES_NUM_USER_SGPR;
} else
unreachable("invalid shader selector type");
S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
- si_set_tesseval_regs(sscreen, shader, pm4);
+ si_set_tesseval_regs(sscreen, shader->selector, pm4);
- polaris_set_vgt_vertex_reuse(sscreen, shader, pm4);
+ polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
}
static unsigned si_get_ps_num_interp(struct si_shader *ps)
si_shader_selector_key_hw_vs(sctx, sel, key);
if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->part.vs.epilog.export_prim_id = 1;
+ key->mono.vs_export_prim_id = 1;
}
break;
case PIPE_SHADER_TESS_CTRL:
si_shader_selector_key_hw_vs(sctx, sel, key);
if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
- key->part.tes.epilog.export_prim_id = 1;
+ key->mono.vs_export_prim_id = 1;
}
break;
case PIPE_SHADER_GEOMETRY:
key, &key->part.gs.vs_prolog);
key->part.gs.es = sctx->vs_shader.cso;
}
+
+ /* Merged ES-GS can have unbalanced wave usage.
+ *
+ * ES threads are per-vertex, while GS threads are
+ * per-primitive. So without any amplification, there
+ * are fewer GS threads than ES threads, which can result
+ * in empty (no-op) GS waves. With too much amplification,
+ * there are more GS threads than ES threads, which
+ * can result in empty (no-op) ES waves.
+ *
+ * Non-monolithic shader parts set EXEC at the beginning and
+ * don't jump to the end if EXEC is 0.
+ *
+ * Monolithic shaders use conditional blocks, so they can jump
+ * over and skip empty waves of ES or GS. So set this to always
+ * use the optimized variants, which are monolithic.
+ */
+ key->opt.prefer_mono = 1;
}
key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
break;
return true;
}
+static void si_destroy_shader_selector(struct si_context *sctx,
+ struct si_shader_selector *sel);
+
+static void si_shader_selector_reference(struct si_context *sctx,
+ struct si_shader_selector **dst,
+ struct si_shader_selector *src)
+{
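+ /* pipe_reference() updates both reference counts and returns true
+ * when the old selector's count drops to zero, i.e. it must be
+ * destroyed. */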
+ if (pipe_reference(&(*dst)->reference, &src->reference))
+ si_destroy_shader_selector(sctx, *dst);
+
+ *dst = src;
+}
+
/* Select the hw shader variant depending on the current state. */
static int si_shader_select_with_key(struct si_screen *sscreen,
struct si_shader_ctx_state *state,
int thread_index)
{
struct si_shader_selector *sel = state->cso;
+ struct si_shader_selector *previous_stage_sel = NULL;
struct si_shader *current = state->current;
struct si_shader *iter, *shader = NULL;
shader->key = *key;
shader->compiler_ctx_state = *compiler_state;
+ /* If this is a merged shader, get the first shader's selector. */
+ if (sscreen->b.chip_class >= GFX9) {
+ if (sel->type == PIPE_SHADER_TESS_CTRL)
+ previous_stage_sel = key->part.tcs.ls;
+ else if (sel->type == PIPE_SHADER_GEOMETRY)
+ previous_stage_sel = key->part.gs.es;
+ }
+
/* Compile the main shader part if it doesn't exist. This can happen
* if the initial guess was wrong. */
bool is_pure_monolithic =
* For merged shaders, check that the starting shader's main
* part is present.
*/
- if (sscreen->b.chip_class >= GFX9 &&
- (sel->type == PIPE_SHADER_TESS_CTRL ||
- sel->type == PIPE_SHADER_GEOMETRY)) {
- struct si_shader_selector *shader1 = NULL;
+ if (previous_stage_sel) {
struct si_shader_key shader1_key = zeroed;
- if (sel->type == PIPE_SHADER_TESS_CTRL) {
- shader1 = key->part.tcs.ls;
+ if (sel->type == PIPE_SHADER_TESS_CTRL)
shader1_key.as_ls = 1;
- } else if (sel->type == PIPE_SHADER_GEOMETRY) {
- shader1 = key->part.gs.es;
+ else if (sel->type == PIPE_SHADER_GEOMETRY)
shader1_key.as_es = 1;
- } else
+ else
assert(0);
- ok = si_check_missing_main_part(sscreen, shader1,
+ ok = si_check_missing_main_part(sscreen,
+ previous_stage_sel,
compiler_state, &shader1_key);
} else {
ok = si_check_missing_main_part(sscreen, sel,
}
}
+ /* Keep the reference to the 1st shader of merged shaders, so that
+ * Gallium can't destroy it before we destroy the 2nd shader.
+ *
+ * Set sctx = NULL, because it's unused if we're not releasing
+ * the shader, and we don't have any sctx here.
+ */
+ si_shader_selector_reference(NULL, &shader->previous_stage_sel,
+ previous_stage_sel);
+
/* Monolithic-only shaders don't make a distinction between optimized
* and unoptimized. */
shader->is_monolithic =
if (!sel)
return NULL;
+ pipe_reference_init(&sel->reference, 1);
sel->screen = sscreen;
sel->compiler_ctx_state.tm = sctx->tm;
sel->compiler_ctx_state.debug = sctx->b.debug;
}
}
+ si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
si_shader_destroy(shader);
free(shader);
}
-static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
+static void si_destroy_shader_selector(struct si_context *sctx,
+ struct si_shader_selector *sel)
{
- struct si_context *sctx = (struct si_context *)ctx;
- struct si_shader_selector *sel = (struct si_shader_selector *)state;
struct si_shader *p = sel->first_variant, *c;
struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
[PIPE_SHADER_VERTEX] = &sctx->vs_shader,
free(sel);
}
+static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
+{
+ struct si_context *sctx = (struct si_context *)ctx;
+ struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+ si_shader_selector_reference(sctx, &sel, NULL);
+}
+
static unsigned si_get_ps_input_cntl(struct si_context *sctx,
struct si_shader *vs, unsigned name,
unsigned index, unsigned interpolate)
assert(sctx->scratch_buffer);
- si_shader_apply_scratch_relocs(sctx, shader, &shader->config, scratch_va);
+ si_shader_apply_scratch_relocs(shader, scratch_va);
/* Replace the shader bo with a new bo that has the relocs applied. */
r = si_shader_binary_upload(sctx->screen, shader);
}
assert(!sctx->tf_ring);
+ /* Use 64K alignment for both rings, so that we can pass the address
+ * to shaders as one SGPR containing bits [16:47].
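+ * (With 64K alignment the low 16 bits are zero, so the full address
+ * can be rebuilt in the shader as (uint64_t)sgpr << 16.)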
+ */
sctx->tf_ring = r600_aligned_buffer_create(sctx->b.b.screen,
R600_RESOURCE_FLAG_UNMAPPABLE,
PIPE_USAGE_DEFAULT,
32768 * sctx->screen->b.info.max_se,
- 256);
+ 64 * 1024);
if (!sctx->tf_ring)
return;
PIPE_USAGE_DEFAULT,
max_offchip_buffers *
sctx->screen->tess_offchip_block_dw_size * 4,
- 256);
+ 64 * 1024);
if (!sctx->tess_offchip_ring)
return;
si_init_config_add_vgt_flush(sctx);
+ uint64_t offchip_va = r600_resource(sctx->tess_offchip_ring)->gpu_address;
+ uint64_t factor_va = r600_resource(sctx->tf_ring)->gpu_address;
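+ /* Both rings were created with 64K alignment above, so bits [0:15]
+ * of their addresses must be zero. */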
+ assert((offchip_va & 0xffff) == 0);
+ assert((factor_va & 0xffff) == 0);
+
+ si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tess_offchip_ring),
+ RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
+ si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tf_ring),
+ RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
+
/* Append these registers to the init config state. */
if (sctx->b.chip_class >= CIK) {
if (sctx->b.chip_class >= VI)
si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
S_030938_SIZE(sctx->tf_ring->width0 / 4));
si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
- r600_resource(sctx->tf_ring)->gpu_address >> 8);
+ factor_va >> 8);
if (sctx->b.chip_class >= GFX9)
si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI,
- r600_resource(sctx->tf_ring)->gpu_address >> 40);
+ factor_va >> 40);
si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM,
S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
S_03093C_OFFCHIP_GRANULARITY(offchip_granularity));
si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
S_008988_SIZE(sctx->tf_ring->width0 / 4));
si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
- r600_resource(sctx->tf_ring)->gpu_address >> 8);
+ factor_va >> 8);
si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers));
}
+ if (sctx->b.chip_class >= GFX9) {
+ si_pm4_set_reg(sctx->init_config,
+ R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+ GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4,
+ offchip_va >> 16);
+ si_pm4_set_reg(sctx->init_config,
+ R_00B430_SPI_SHADER_USER_DATA_LS_0 +
+ GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K * 4,
+ factor_va >> 16);
+ } else {
+ si_pm4_set_reg(sctx->init_config,
+ R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+ GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4,
+ offchip_va >> 16);
+ si_pm4_set_reg(sctx->init_config,
+ R_00B430_SPI_SHADER_USER_DATA_HS_0 +
+ GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K * 4,
+ factor_va >> 16);
+ }
+
/* Flush the context to re-emit the init_config state.
* This is done only once in a lifetime of a context.
*/
si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
sctx->b.initial_gfx_cs_size = 0; /* force flush */
si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
-
- si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_FACTOR, sctx->tf_ring,
- 0, sctx->tf_ring->width0, false, false, 0, 0, 0);
-
- si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_OFFCHIP,
- sctx->tess_offchip_ring, 0,
- sctx->tess_offchip_ring->width0, false, false, 0, 0, 0);
}
/**