* Return the IR binary in a buffer. For TGSI the first 4 bytes contain its
* size as integer.
*/
-static void *si_get_ir_binary(struct si_shader_selector *sel)
+void *si_get_ir_binary(struct si_shader_selector *sel)
{
struct blob blob;
unsigned ir_size;
*
* Returns false on failure, in which case the ir_binary should be freed.
*/
-static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
- void *ir_binary,
- struct si_shader *shader,
- bool insert_into_disk_cache)
+bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+ struct si_shader *shader,
+ bool insert_into_disk_cache)
{
void *hw_binary;
struct hash_entry *entry;
return true;
}
-static bool si_shader_cache_load_shader(struct si_screen *sscreen,
- void *ir_binary,
- struct si_shader *shader)
+bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+ struct si_shader *shader)
{
struct hash_entry *entry =
_mesa_hash_table_search(sscreen->shader_cache, ir_binary);
polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
}
+static unsigned si_conv_prim_to_gs_out(unsigned mode)
+{
+ static const int prim_conv[] = {
+ [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+ [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+ [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST,
+ };
+ assert(mode < ARRAY_SIZE(prim_conv));
+
+ return prim_conv[mode];
+}
+
struct gfx9_gs_info {
unsigned es_verts_per_subgroup;
unsigned gs_prims_per_subgroup;
if (max_stream >= 2)
offset += num_components[2] * sel->gs_max_out_vertices;
si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
+ si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
+ si_conv_prim_to_gs_out(sel->gs_output_prim));
if (max_stream >= 3)
offset += num_components[3] * sel->gs_max_out_vertices;
si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
}
/* Find out which VS outputs aren't used by the PS. */
- uint64_t outputs_written = vs->outputs_written;
+ uint64_t outputs_written = vs->outputs_written_before_ps;
uint64_t inputs_read = 0;
- /* ignore POSITION, PSIZE */
- outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0) |
- (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0))));
+ /* Ignore outputs that are not passed from VS to PS. */
+ outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
+ (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
+ (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
if (!ps_disabled) {
inputs_read = ps->inputs_read;
key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
}
- if (rs) {
- bool is_poly = (sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES &&
- sctx->current_rast_prim <= PIPE_PRIM_POLYGON) ||
- sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
- bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
+ bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
+ bool is_line = util_prim_is_lines(sctx->current_rast_prim);
- key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
- key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
+ key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
+ key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
- if (sctx->queued.named.blend) {
- key->part.ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
- rs->multisample_enable;
- }
+ if (sctx->queued.named.blend) {
+ key->part.ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
+ rs->multisample_enable;
+ }
- key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
- key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
- (is_line && rs->line_smooth)) &&
- sctx->framebuffer.nr_samples <= 1;
- key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+ key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+ key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
+ (is_line && rs->line_smooth)) &&
+ sctx->framebuffer.nr_samples <= 1;
+ key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
- if (sctx->ps_iter_samples > 1 &&
- sel->info.reads_samplemask) {
- key->part.ps.prolog.samplemask_log_ps_iter =
- util_logbase2(sctx->ps_iter_samples);
- }
+ if (sctx->ps_iter_samples > 1 &&
+ sel->info.reads_samplemask) {
+ key->part.ps.prolog.samplemask_log_ps_iter =
+ util_logbase2(sctx->ps_iter_samples);
+ }
- if (rs->force_persample_interp &&
- rs->multisample_enable &&
- sctx->framebuffer.nr_samples > 1 &&
- sctx->ps_iter_samples > 1) {
- key->part.ps.prolog.force_persp_sample_interp =
- sel->info.uses_persp_center ||
- sel->info.uses_persp_centroid;
-
- key->part.ps.prolog.force_linear_sample_interp =
- sel->info.uses_linear_center ||
- sel->info.uses_linear_centroid;
- } else if (rs->multisample_enable &&
- sctx->framebuffer.nr_samples > 1) {
- key->part.ps.prolog.bc_optimize_for_persp =
- sel->info.uses_persp_center &&
- sel->info.uses_persp_centroid;
- key->part.ps.prolog.bc_optimize_for_linear =
- sel->info.uses_linear_center &&
- sel->info.uses_linear_centroid;
- } else {
- /* Make sure SPI doesn't compute more than 1 pair
- * of (i,j), which is the optimization here. */
- key->part.ps.prolog.force_persp_center_interp =
- sel->info.uses_persp_center +
- sel->info.uses_persp_centroid +
- sel->info.uses_persp_sample > 1;
-
- key->part.ps.prolog.force_linear_center_interp =
- sel->info.uses_linear_center +
- sel->info.uses_linear_centroid +
- sel->info.uses_linear_sample > 1;
-
- if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
- key->mono.u.ps.interpolate_at_sample_force_center = 1;
- }
+ if (rs->force_persample_interp &&
+ rs->multisample_enable &&
+ sctx->framebuffer.nr_samples > 1 &&
+ sctx->ps_iter_samples > 1) {
+ key->part.ps.prolog.force_persp_sample_interp =
+ sel->info.uses_persp_center ||
+ sel->info.uses_persp_centroid;
+
+ key->part.ps.prolog.force_linear_sample_interp =
+ sel->info.uses_linear_center ||
+ sel->info.uses_linear_centroid;
+ } else if (rs->multisample_enable &&
+ sctx->framebuffer.nr_samples > 1) {
+ key->part.ps.prolog.bc_optimize_for_persp =
+ sel->info.uses_persp_center &&
+ sel->info.uses_persp_centroid;
+ key->part.ps.prolog.bc_optimize_for_linear =
+ sel->info.uses_linear_center &&
+ sel->info.uses_linear_centroid;
+ } else {
+ /* Make sure SPI doesn't compute more than 1 pair
+ * of (i,j), which is the optimization here. */
+ key->part.ps.prolog.force_persp_center_interp =
+ sel->info.uses_persp_center +
+ sel->info.uses_persp_centroid +
+ sel->info.uses_persp_sample > 1;
+
+ key->part.ps.prolog.force_linear_center_interp =
+ sel->info.uses_linear_center +
+ sel->info.uses_linear_centroid +
+ sel->info.uses_linear_sample > 1;
+
+ if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
+ key->mono.u.ps.interpolate_at_sample_force_center = 1;
}
key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
{
struct si_shader_selector *sel = shader->selector;
struct si_screen *sscreen = sel->screen;
- struct si_compiler *compiler;
+ struct ac_llvm_compiler *compiler;
struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
int r;
main_part->selector = sel;
main_part->key.as_es = key->as_es;
main_part->key.as_ls = key->as_ls;
+ main_part->is_monolithic = false;
if (si_compile_tgsi_shader(sscreen, compiler_state->compiler,
- main_part, false,
- &compiler_state->debug) != 0) {
+ main_part, &compiler_state->debug) != 0) {
FREE(main_part);
return false;
}
{
struct si_shader_selector *sel = (struct si_shader_selector *)job;
struct si_screen *sscreen = sel->screen;
- struct si_compiler *compiler;
+ struct ac_llvm_compiler *compiler;
struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
assert(!debug->debug_message || debug->async);
util_queue_fence_init(&shader->ready);
shader->selector = sel;
+ shader->is_monolithic = false;
si_parse_next_shader_property(&sel->info,
sel->so.num_outputs != 0,
&shader->key);
mtx_unlock(&sscreen->shader_cache_mutex);
/* Compile the shader if it hasn't been loaded from the cache. */
- if (si_compile_tgsi_shader(sscreen, compiler, shader, false,
+ if (si_compile_tgsi_shader(sscreen, compiler, shader,
debug) != 0) {
FREE(shader);
FREE(ir_binary);
break;
/* fall through */
default:
- id = si_shader_io_get_unique_index(name, index);
- sel->outputs_written &= ~(1ull << id);
+ id = si_shader_io_get_unique_index(name, index, true);
+ sel->outputs_written_before_ps &= ~(1ull << id);
break;
case TGSI_SEMANTIC_POSITION: /* ignore these */
case TGSI_SEMANTIC_PSIZE:
}
}
+void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
+ struct util_queue_fence *ready_fence,
+ struct si_compiler_ctx_state *compiler_ctx_state,
+ void *job, util_queue_execute_func execute)
+{
+ util_queue_fence_init(ready_fence);
+
+ struct util_async_debug_callback async_debug;
+ bool wait =
+ (sctx->debug.debug_message && !sctx->debug.async) ||
+ sctx->is_debug ||
+ si_can_dump_shader(sctx->screen, processor);
+
+ if (wait) {
+ u_async_debug_init(&async_debug);
+ compiler_ctx_state->debug = async_debug.base;
+ }
+
+ util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
+ ready_fence, execute, NULL);
+
+ if (wait) {
+ util_queue_fence_wait(ready_fence);
+ u_async_debug_drain(&async_debug, &sctx->debug);
+ u_async_debug_cleanup(&async_debug);
+ }
+}
+
/* Return descriptor slot usage masks from the given shader info. */
void si_get_active_slot_masks(const struct tgsi_shader_info *info,
uint32_t *const_and_shader_buffers,
/* fall through */
default:
sel->outputs_written |=
- 1ull << si_shader_io_get_unique_index(name, index);
+ 1ull << si_shader_io_get_unique_index(name, index, false);
+ sel->outputs_written_before_ps |=
+ 1ull << si_shader_io_get_unique_index(name, index, true);
break;
- case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */
case TGSI_SEMANTIC_EDGEFLAG:
break;
}
*/
if (sctx->chip_class >= GFX9)
sel->esgs_itemsize += 4;
+
+ assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
break;
case PIPE_SHADER_FRAGMENT:
/* fall through */
default:
sel->inputs_read |=
- 1ull << si_shader_io_get_unique_index(name, index);
+ 1ull << si_shader_io_get_unique_index(name, index, true);
break;
case TGSI_SEMANTIC_PCOORD: /* ignore this */
break;
}
(void) mtx_init(&sel->mutex, mtx_plain);
- util_queue_fence_init(&sel->ready);
-
- struct util_async_debug_callback async_debug;
- bool wait =
- (sctx->debug.debug_message && !sctx->debug.async) ||
- sctx->is_debug ||
- si_can_dump_shader(sscreen, sel->info.processor);
-
- if (wait) {
- u_async_debug_init(&async_debug);
- sel->compiler_ctx_state.debug = async_debug.base;
- }
-
- util_queue_add_job(&sscreen->shader_compiler_queue, sel,
- &sel->ready, si_init_shader_selector_async,
- NULL);
-
- if (wait) {
- util_queue_fence_wait(&sel->ready);
- u_async_debug_drain(&async_debug, &sctx->debug);
- u_async_debug_cleanup(&async_debug);
- }
+ si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
+ &sel->compiler_ctx_state, sel,
+ si_init_shader_selector_async);
return sel;
}
static void si_emit_spi_map(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->gfx_cs;
struct si_shader *ps = sctx->ps_shader.current;
struct si_shader *vs = si_get_vs_state(sctx);
struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL;
unsigned i, num_interp, num_written = 0, bcol_interp[2];
+ unsigned spi_ps_input_cntl[32];
if (!ps || !ps->selector->info.num_inputs)
return;
num_interp = si_get_ps_num_interp(ps);
assert(num_interp > 0);
- radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, num_interp);
for (i = 0; i < psinfo->num_inputs; i++) {
unsigned name = psinfo->input_semantic_name[i];
unsigned index = psinfo->input_semantic_index[i];
unsigned interpolate = psinfo->input_interpolate[i];
- radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, name, index,
- interpolate));
- num_written++;
+ spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name,
+ index, interpolate);
if (name == TGSI_SEMANTIC_COLOR) {
assert(index < ARRAY_SIZE(bcol_interp));
if (!(psinfo->colors_read & (0xf << (i * 4))))
continue;
- radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, bcol,
- i, bcol_interp[i]));
- num_written++;
+ spi_ps_input_cntl[num_written++] =
+ si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
+
}
}
assert(num_interp == num_written);
+
+ /* R_028644_SPI_PS_INPUT_CNTL_0 */
+ /* Dota 2: Only ~16% of SPI map updates set different values. */
+ /* Talos: Only ~9% of SPI map updates set different values. */
+ radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
+ spi_ps_input_cntl,
+ sctx->tracked_regs.spi_ps_input_cntl, num_interp);
}
/**
static void si_emit_scratch_state(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->gfx_cs;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
sctx->spi_tmpring_size);