X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state_shaders.c;h=ecb8a0dad8531125b044fb994fd18c8d87b8a5ff;hb=f0d74ecce8d3353ed2696cb4b1e707fd6ddf0a40;hp=6b9107785362f00270ce5be82610b3c0eb1774a8;hpb=3b2e93e472d185a5df5ec3c3d23c8744a0c23e42;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 6b910778536..ecb8a0dad85 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -19,10 +19,6 @@ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * Authors: - * Christian König - * Marek Olšák */ #include "si_pipe.h" @@ -34,12 +30,14 @@ #include "tgsi/tgsi_ureg.h" #include "util/hash_table.h" #include "util/crc32.h" +#include "util/u_async_debug.h" #include "util/u_memory.h" #include "util/u_prim.h" #include "util/disk_cache.h" #include "util/mesa-sha1.h" #include "ac_exp_param.h" +#include "ac_shader_util.h" /* SHADER_CACHE */ @@ -209,11 +207,11 @@ static bool si_shader_cache_insert_shader(struct si_screen *sscreen, return false; } - if (sscreen->b.disk_shader_cache && insert_into_disk_cache) { - disk_cache_compute_key(sscreen->b.disk_shader_cache, tgsi_binary, + if (sscreen->disk_shader_cache && insert_into_disk_cache) { + disk_cache_compute_key(sscreen->disk_shader_cache, tgsi_binary, *((uint32_t *)tgsi_binary), key); - disk_cache_put(sscreen->b.disk_shader_cache, key, hw_binary, - *((uint32_t *) hw_binary)); + disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, + *((uint32_t *) hw_binary), NULL); } return true; @@ -226,16 +224,16 @@ static bool si_shader_cache_load_shader(struct si_screen *sscreen, struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, tgsi_binary); if (!entry) { - if (sscreen->b.disk_shader_cache) { + if (sscreen->disk_shader_cache) { unsigned char sha1[CACHE_KEY_SIZE]; size_t tg_size = *((uint32_t *) tgsi_binary); - disk_cache_compute_key(sscreen->b.disk_shader_cache, + disk_cache_compute_key(sscreen->disk_shader_cache, tgsi_binary, tg_size, sha1); size_t binary_size; uint8_t *buffer = - disk_cache_get(sscreen->b.disk_shader_cache, + disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size); if (!buffer) return false; @@ -249,7 +247,7 @@ static bool si_shader_cache_load_shader(struct si_screen *sscreen, assert(!"Invalid radeonsi shader disk cache " "item!"); - disk_cache_remove(sscreen->b.disk_shader_cache, + disk_cache_remove(sscreen->disk_shader_cache, sha1); free(buffer); @@ -274,7 +272,7 @@ static bool si_shader_cache_load_shader(struct si_screen *sscreen, else return false; } - p_atomic_inc(&sscreen->b.num_shader_cache_hits); + p_atomic_inc(&sscreen->num_shader_cache_hits); return true; } @@ -375,8 +373,8 @@ static void si_set_tesseval_regs(struct si_screen *sscreen, topology = V_028B6C_OUTPUT_TRIANGLE_CW; if (sscreen->has_distributed_tess) { - if (sscreen->b.family == CHIP_FIJI || - sscreen->b.family >= CHIP_POLARIS10) + if (sscreen->info.family == CHIP_FIJI || + sscreen->info.family >= CHIP_POLARIS10) distribution_mode = V_028B6C_DISTRIBUTION_MODE_TRAPEZOIDS; else distribution_mode = V_028B6C_DISTRIBUTION_MODE_DONUTS; @@ -411,7 +409,7 @@ static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, { unsigned type = sel->type; - if (sscreen->b.family < CHIP_POLARIS10) + if (sscreen->info.family < 
CHIP_POLARIS10) return; /* VS as VS, or VS as ES: */ @@ -448,7 +446,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) unsigned vgpr_comp_cnt; uint64_t va; - assert(sscreen->b.chip_class <= VI); + assert(sscreen->info.chip_class <= VI); pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -488,7 +486,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - if (sscreen->b.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX9) { si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, va >> 40); @@ -498,11 +496,6 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) */ ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1; - if (shader->config.scratch_bytes_per_wave) { - fprintf(stderr, "HS: scratch buffer unsupported"); - abort(); - } - shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX9_TCS_NUM_USER_SGPR) | S_00B42C_USER_SGPR_MSB(GFX9_TCS_NUM_USER_SGPR >> 5) | @@ -524,7 +517,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) S_00B428_FLOAT_MODE(shader->config.float_mode) | S_00B428_LS_VGPR_COMP_CNT(ls_vgpr_comp_cnt)); - if (sscreen->b.chip_class <= VI) { + if (sscreen->info.chip_class <= VI) { si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2); } @@ -538,7 +531,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) uint64_t va; unsigned oc_lds_en; - assert(sscreen->b.chip_class <= VI); + assert(sscreen->info.chip_class <= VI); pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -580,34 +573,6 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4); } -/** - * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a - * geometry shader. - */ -static uint32_t si_vgt_gs_mode(struct si_shader_selector *sel) -{ - enum chip_class chip_class = sel->screen->b.chip_class; - unsigned gs_max_vert_out = sel->gs_max_out_vertices; - unsigned cut_mode; - - if (gs_max_vert_out <= 128) { - cut_mode = V_028A40_GS_CUT_128; - } else if (gs_max_vert_out <= 256) { - cut_mode = V_028A40_GS_CUT_256; - } else if (gs_max_vert_out <= 512) { - cut_mode = V_028A40_GS_CUT_512; - } else { - assert(gs_max_vert_out <= 1024); - cut_mode = V_028A40_GS_CUT_1024; - } - - return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | - S_028A40_CUT_MODE(cut_mode)| - S_028A40_ES_WRITE_OPTIMIZE(chip_class <= VI) | - S_028A40_GS_WRITE_OPTIMIZE(1) | - S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0); -} - struct gfx9_gs_info { unsigned es_verts_per_subgroup; unsigned gs_prims_per_subgroup; @@ -649,9 +614,11 @@ static void gfx9_get_gs_info(struct si_shader_selector *es, /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. * Make sure we don't go over the maximum value. 
*/ - max_gs_prims = MIN2(max_gs_prims, - max_out_prims / - (gs->gs_max_out_vertices * gs_num_invocations)); + if (gs->gs_max_out_vertices > 0) { + max_gs_prims = MIN2(max_gs_prims, + max_out_prims / + (gs->gs_max_out_vertices * gs_num_invocations)); + } assert(max_gs_prims > 0); /* If the primitive has adjacency, halve the number of vertices @@ -757,7 +724,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); - if (sscreen->b.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX9) { unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; unsigned es_type = shader->key.part.gs.es->type; unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; @@ -816,11 +783,6 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); - - if (shader->config.scratch_bytes_per_wave) { - fprintf(stderr, "GS: scratch buffer unsupported"); - abort(); - } } else { si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40); @@ -846,14 +808,15 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, struct si_shader_selector *gs) { + const struct tgsi_shader_info *info = &shader->selector->info; struct si_pm4_state *pm4; unsigned num_user_sgprs; unsigned nparams, vgpr_comp_cnt; uint64_t va; unsigned oc_lds_en; unsigned window_space = - shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; - bool enable_prim_id = si_vs_exports_prim_id(shader); + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool enable_prim_id = shader->key.mono.u.vs_export_prim_id || info->uses_primid; pm4 = si_get_shader_pm4_state(shader); if (!pm4) @@ -867,14 +830,27 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, * not sent again. */ if (!gs) { - si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, - S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0)); + unsigned mode = V_028A40_GS_OFF; + + /* PrimID needs GS scenario A. */ + if (enable_prim_id) + mode = V_028A40_GS_SCENARIO_A; + + si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, S_028A40_MODE(mode)); si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id); } else { - si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(gs)); + si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, + ac_vgt_gs_mode(gs->gs_max_out_vertices, + sscreen->info.chip_class)); si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0); } + if (sscreen->info.chip_class <= VI) { + /* Reuse needs to be set off if we write oViewport. */ + si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, + S_028AB4_REUSE_OFF(info->writes_viewport_index)); + } + va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); @@ -887,9 +863,15 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. */ vgpr_comp_cnt = enable_prim_id ? 2 : (shader->info.uses_instanceid ? 
1 : 0); - num_user_sgprs = SI_VS_NUM_USER_SGPR; + + if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) { + num_user_sgprs = SI_SGPR_VS_BLIT_DATA + + info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + } else { + num_user_sgprs = SI_VS_NUM_USER_SGPR; + } } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { - vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; + vgpr_comp_cnt = enable_prim_id ? 3 : 2; num_user_sgprs = SI_TES_NUM_USER_SGPR; } else unreachable("invalid shader selector type"); @@ -973,38 +955,6 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) return value; } -static unsigned si_get_cb_shader_mask(unsigned spi_shader_col_format) -{ - unsigned i, cb_shader_mask = 0; - - for (i = 0; i < 8; i++) { - switch ((spi_shader_col_format >> (i * 4)) & 0xf) { - case V_028714_SPI_SHADER_ZERO: - break; - case V_028714_SPI_SHADER_32_R: - cb_shader_mask |= 0x1 << (i * 4); - break; - case V_028714_SPI_SHADER_32_GR: - cb_shader_mask |= 0x3 << (i * 4); - break; - case V_028714_SPI_SHADER_32_AR: - cb_shader_mask |= 0x9 << (i * 4); - break; - case V_028714_SPI_SHADER_FP16_ABGR: - case V_028714_SPI_SHADER_UNORM16_ABGR: - case V_028714_SPI_SHADER_SNORM16_ABGR: - case V_028714_SPI_SHADER_UINT16_ABGR: - case V_028714_SPI_SHADER_SINT16_ABGR: - case V_028714_SPI_SHADER_32_ABGR: - cb_shader_mask |= 0xf << (i * 4); - break; - default: - assert(0); - } - } - return cb_shader_mask; -} - static void si_shader_ps(struct si_shader *shader) { struct tgsi_shader_info *info = &shader->selector->info; @@ -1087,7 +1037,7 @@ static void si_shader_ps(struct si_shader *shader) spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); spi_shader_col_format = si_get_spi_shader_col_format(shader); - cb_shader_mask = si_get_cb_shader_mask(spi_shader_col_format); + cb_shader_mask = ac_get_cb_shader_mask(spi_shader_col_format); /* Ensure that some export memory is always allocated, for two reasons: * @@ -1116,7 +1066,7 @@ static void si_shader_ps(struct si_shader *shader) si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control); si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT, - si_get_spi_shader_z_format(info->writes_z, + ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, info->writes_samplemask)); @@ -1188,13 +1138,18 @@ static void si_shader_selector_key_vs(struct si_context *sctx, if (!sctx->vertex_elements) return; + prolog_key->instance_divisor_is_one = + sctx->vertex_elements->instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = + sctx->vertex_elements->instance_divisor_is_fetched; + + /* Prefer a monolithic shader to allow scheduling divisions around + * VBO loads. */ + if (prolog_key->instance_divisor_is_fetched) + key->opt.prefer_mono = 1; + unsigned count = MIN2(vs->info.num_inputs, sctx->vertex_elements->count); - for (unsigned i = 0; i < count; ++i) { - prolog_key->instance_divisors[i] = - sctx->vertex_elements->elements[i].instance_divisor; - } - memcpy(key->mono.vs_fix_fetch, sctx->vertex_elements->fix_fetch, count); } @@ -1204,7 +1159,7 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, { struct si_shader_selector *ps = sctx->ps_shader.cso; - key->opt.hw_vs.clip_disable = + key->opt.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 && (vs->info.clipdist_writemask || vs->info.writes_clipvertex) && @@ -1213,10 +1168,13 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, /* Find out if PS is disabled. 
*/ bool ps_disabled = true; if (ps) { + const struct si_state_blend *blend = sctx->queued.named.blend; + bool alpha_to_coverage = blend && blend->alpha_to_coverage; bool ps_modifies_zs = ps->info.uses_kill || ps->info.writes_z || ps->info.writes_stencil || ps->info.writes_samplemask || + alpha_to_coverage || si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS; unsigned ps_colormask = sctx->framebuffer.colorbuf_enabled_4bit & @@ -1232,22 +1190,19 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, /* Find out which VS outputs aren't used by the PS. */ uint64_t outputs_written = vs->outputs_written; - uint32_t outputs_written2 = vs->outputs_written2; uint64_t inputs_read = 0; - uint32_t inputs_read2 = 0; - outputs_written &= ~0x3; /* ignore POSITION, PSIZE */ + /* ignore POSITION, PSIZE */ + outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0) | + (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0)))); if (!ps_disabled) { inputs_read = ps->inputs_read; - inputs_read2 = ps->inputs_read2; } uint64_t linked = outputs_written & inputs_read; - uint32_t linked2 = outputs_written2 & inputs_read2; - key->opt.hw_vs.kill_outputs = ~linked & outputs_written; - key->opt.hw_vs.kill_outputs2 = ~linked2 & outputs_written2; + key->opt.kill_outputs = ~linked & outputs_written; } /* Compute the key for the hw shader variant */ @@ -1271,7 +1226,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, si_shader_selector_key_hw_vs(sctx, sel, key); if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->part.vs.epilog.export_prim_id = 1; + key->mono.u.vs_export_prim_id = 1; } break; case PIPE_SHADER_TESS_CTRL: @@ -1279,15 +1234,31 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.tcs.ls_prolog); key->part.tcs.ls = sctx->vs_shader.cso; + + /* When the LS VGPR fix is needed, monolithic shaders + * can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix; + + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * The LS VGPR fix prefers this too. 
+ */ + key->opt.prefer_mono = 1; } key->part.tcs.epilog.prim_mode = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + key->part.tcs.epilog.invoc0_tess_factors_are_def = + sel->tcs_info.tessfactors_are_def_in_all_invocs; key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors; if (sel == sctx->fixed_func_tcs_shader.cso) - key->mono.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written; + key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written; break; case PIPE_SHADER_TESS_EVAL: if (sctx->gs_shader.cso) @@ -1296,7 +1267,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, si_shader_selector_key_hw_vs(sctx, sel, key); if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) - key->part.tes.epilog.export_prim_id = 1; + key->mono.u.vs_export_prim_id = 1; } break; case PIPE_SHADER_GEOMETRY: @@ -1351,6 +1322,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, sctx->framebuffer.spi_shader_col_format_alpha) | (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & sctx->framebuffer.spi_shader_col_format); + key->part.ps.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; /* The output for dual source blending should have * the same format as the first output. @@ -1404,6 +1376,12 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, sctx->framebuffer.nr_samples <= 1; key->part.ps.epilog.clamp_color = rs->clamp_fragment_color; + if (sctx->ps_iter_samples > 1 && + sel->info.reads_samplemask) { + key->part.ps.prolog.samplemask_log_ps_iter = + util_logbase2(util_next_power_of_two(sctx->ps_iter_samples)); + } + if (rs->force_persample_interp && rs->multisample_enable && sctx->framebuffer.nr_samples > 1 && @@ -1435,6 +1413,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, sel->info.uses_linear_center + sel->info.uses_linear_centroid + sel->info.uses_linear_sample > 1; + + if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE]) + key->mono.u.ps.interpolate_at_sample_force_center = 1; } } @@ -1444,11 +1425,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, default: assert(0); } + + if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT))) + memset(&key->opt, 0, sizeof(key->opt)); } -static void si_build_shader_variant(void *job, int thread_index) +static void si_build_shader_variant(struct si_shader *shader, + int thread_index, + bool low_priority) { - struct si_shader *shader = (struct si_shader *)job; struct si_shader_selector *sel = shader->selector; struct si_screen *sscreen = sel->screen; LLVMTargetMachineRef tm; @@ -1456,11 +1441,17 @@ static void si_build_shader_variant(void *job, int thread_index) int r; if (thread_index >= 0) { - assert(thread_index < ARRAY_SIZE(sscreen->tm)); - tm = sscreen->tm[thread_index]; + if (low_priority) { + assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority)); + tm = sscreen->tm_low_priority[thread_index]; + } else { + assert(thread_index < ARRAY_SIZE(sscreen->tm)); + tm = sscreen->tm[thread_index]; + } if (!debug->async) debug = NULL; } else { + assert(!low_priority); tm = shader->compiler_ctx_state.tm; } @@ -1484,6 +1475,15 @@ static void si_build_shader_variant(void *job, int thread_index) si_shader_init_pm4_state(sscreen, shader); } +static void si_build_shader_variant_low_priority(void *job, int thread_index) +{ + struct si_shader *shader = (struct si_shader *)job; + + assert(thread_index >= 0); + + si_build_shader_variant(shader, thread_index, true); 
+} + static const struct si_shader_key zeroed; static bool si_check_missing_main_part(struct si_screen *sscreen, @@ -1499,6 +1499,11 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, if (!main_part) return false; + /* We can leave the fence as permanently signaled because the + * main part becomes visible globally only after it has been + * compiled. */ + util_queue_fence_init(&main_part->ready); + main_part->selector = sel; main_part->key.as_es = key->as_es; main_part->key.as_ls = key->as_ls; @@ -1514,19 +1519,6 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, return true; } -static void si_destroy_shader_selector(struct si_context *sctx, - struct si_shader_selector *sel); - -static void si_shader_selector_reference(struct si_context *sctx, - struct si_shader_selector **dst, - struct si_shader_selector *src) -{ - if (pipe_reference(&(*dst)->reference, &src->reference)) - si_destroy_shader_selector(sctx, *dst); - - *dst = src; -} - /* Select the hw shader variant depending on the current state. */ static int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, @@ -1539,20 +1531,25 @@ static int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader *current = state->current; struct si_shader *iter, *shader = NULL; - if (unlikely(sscreen->b.debug_flags & DBG_NO_OPT_VARIANT)) { - memset(&key->opt, 0, sizeof(key->opt)); - } - again: /* Check if we don't need to change anything. * This path is also used for most shaders that don't need multiple * variants, it will cost just a computation of the key and this * test. */ if (likely(current && - memcmp(¤t->key, key, sizeof(*key)) == 0 && - (!current->is_optimized || - util_queue_fence_is_signalled(¤t->optimized_ready)))) + memcmp(¤t->key, key, sizeof(*key)) == 0)) { + if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) { + if (current->is_optimized) { + memset(&key->opt, 0, sizeof(key->opt)); + goto current_not_ready; + } + + util_queue_fence_wait(¤t->ready); + } + return current->compilation_failed ? -1 : 0; + } +current_not_ready: /* This must be done before the mutex is locked, because async GS * compilation calls this function too, and therefore must enter @@ -1571,24 +1568,26 @@ again: /* Don't check the "current" shader. We checked it above. */ if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) { - /* If it's an optimized shader and its compilation has - * been started but isn't done, use the unoptimized - * shader so as not to cause a stall due to compilation. - */ - if (iter->is_optimized && - !util_queue_fence_is_signalled(&iter->optimized_ready)) { - memset(&key->opt, 0, sizeof(key->opt)); - mtx_unlock(&sel->mutex); - goto again; + mtx_unlock(&sel->mutex); + + if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { + /* If it's an optimized shader and its compilation has + * been started but isn't done, use the unoptimized + * shader so as not to cause a stall due to compilation. 
+ */ + if (iter->is_optimized) { + memset(&key->opt, 0, sizeof(key->opt)); + goto again; + } + + util_queue_fence_wait(&iter->ready); } if (iter->compilation_failed) { - mtx_unlock(&sel->mutex); return -1; /* skip the draw call */ } state->current = iter; - mtx_unlock(&sel->mutex); return 0; } } @@ -1599,16 +1598,23 @@ again: mtx_unlock(&sel->mutex); return -ENOMEM; } + + util_queue_fence_init(&shader->ready); + shader->selector = sel; shader->key = *key; shader->compiler_ctx_state = *compiler_state; /* If this is a merged shader, get the first shader's selector. */ - if (sscreen->b.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX9) { if (sel->type == PIPE_SHADER_TESS_CTRL) previous_stage_sel = key->part.tcs.ls; else if (sel->type == PIPE_SHADER_GEOMETRY) previous_stage_sel = key->part.gs.es; + + /* We need to wait for the previous shader. */ + if (previous_stage_sel && thread_index < 0) + util_queue_fence_wait(&previous_stage_sel->ready); } /* Compile the main shader part if it doesn't exist. This can happen @@ -1637,9 +1643,11 @@ again: else assert(0); + mtx_lock(&previous_stage_sel->mutex); ok = si_check_missing_main_part(sscreen, previous_stage_sel, compiler_state, &shader1_key); + mtx_unlock(&previous_stage_sel->mutex); } else { ok = si_check_missing_main_part(sscreen, sel, compiler_state, key); @@ -1669,25 +1677,25 @@ again: shader->is_optimized = !is_pure_monolithic && memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - if (shader->is_optimized) - util_queue_fence_init(&shader->optimized_ready); - - if (!sel->last_variant) { - sel->first_variant = shader; - sel->last_variant = shader; - } else { - sel->last_variant->next_variant = shader; - sel->last_variant = shader; - } /* If it's an optimized shader, compile it asynchronously. */ if (shader->is_optimized && !is_pure_monolithic && thread_index < 0) { /* Compile it asynchronously. */ - util_queue_add_job(&sscreen->shader_compiler_queue, - shader, &shader->optimized_ready, - si_build_shader_variant, NULL); + util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, + shader, &shader->ready, + si_build_shader_variant_low_priority, NULL); + + /* Add only after the ready fence was reset, to guard against a + * race with si_bind_XX_shader. */ + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } /* Use the default (unoptimized) shader for now. */ memset(&key->opt, 0, sizeof(key->opt)); @@ -1695,13 +1703,27 @@ again: goto again; } + /* Reset the fence before adding to the variant list. */ + util_queue_fence_reset(&shader->ready); + + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + mtx_unlock(&sel->mutex); + assert(!shader->is_optimized); - si_build_shader_variant(shader, thread_index); + si_build_shader_variant(shader, thread_index, false); + + util_queue_fence_signal(&shader->ready); if (!shader->compilation_failed) state->current = shader; - mtx_unlock(&sel->mutex); return shader->compilation_failed ? 
-1 : 0; } @@ -1718,6 +1740,7 @@ static int si_shader_select(struct pipe_context *ctx, } static void si_parse_next_shader_property(const struct tgsi_shader_info *info, + bool streamout, struct si_shader_key *key) { unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER]; @@ -1733,11 +1756,12 @@ static void si_parse_next_shader_property(const struct tgsi_shader_info *info, key->as_ls = 1; break; default: - /* If POSITION isn't written, it can't be a HW VS. - * Assume that it's a HW LS. (the next shader is TCS) + /* If POSITION isn't written, it can only be a HW VS + * if streamout is used. If streamout isn't used, + * assume that it's a HW LS. (the next shader is TCS) * This heuristic is needed for separate shader objects. */ - if (!info->writes_position) + if (!info->writes_position && !streamout) key->as_ls = 1; } break; @@ -1755,7 +1779,7 @@ static void si_parse_next_shader_property(const struct tgsi_shader_info *info, * si_shader_selector initialization. Since it can be done asynchronously, * there is no way to report compile failures to applications. */ -void si_init_shader_selector_async(void *job, int thread_index) +static void si_init_shader_selector_async(void *job, int thread_index) { struct si_shader_selector *sel = (struct si_shader_selector *)job; struct si_screen *sscreen = sel->screen; @@ -1763,14 +1787,10 @@ void si_init_shader_selector_async(void *job, int thread_index) struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; unsigned i; - if (thread_index >= 0) { - assert(thread_index < ARRAY_SIZE(sscreen->tm)); - tm = sscreen->tm[thread_index]; - if (!debug->async) - debug = NULL; - } else { - tm = sel->compiler_ctx_state.tm; - } + assert(!debug->debug_message || debug->async); + assert(thread_index >= 0); + assert(thread_index < ARRAY_SIZE(sscreen->tm)); + tm = sscreen->tm[thread_index]; /* Compile the main shader part for use with a prolog and/or epilog. * If this fails, the driver will try to compile a monolithic shader @@ -1778,17 +1798,24 @@ void si_init_shader_selector_async(void *job, int thread_index) */ if (!sscreen->use_monolithic_shaders) { struct si_shader *shader = CALLOC_STRUCT(si_shader); - void *tgsi_binary; + void *tgsi_binary = NULL; if (!shader) { fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); return; } + /* We can leave the fence signaled because use of the default + * main part is guarded by the selector's ready fence. */ + util_queue_fence_init(&shader->ready); + shader->selector = sel; - si_parse_next_shader_property(&sel->info, &shader->key); + si_parse_next_shader_property(&sel->info, + sel->so.num_outputs != 0, + &shader->key); - tgsi_binary = si_get_tgsi_binary(sel); + if (sel->tokens) + tgsi_binary = si_get_tgsi_binary(sel); /* Try to load the shader from the shader cache. */ mtx_lock(&sscreen->shader_cache_mutex); @@ -1844,10 +1871,10 @@ void si_init_shader_selector_async(void *job, int thread_index) switch (name) { case TGSI_SEMANTIC_GENERIC: /* don't process indices the function can't handle */ - if (index >= 60) + if (index >= SI_MAX_IO_GENERIC) break; /* fall through */ - case TGSI_SEMANTIC_CLIPDIST: + default: id = si_shader_io_get_unique_index(name, index); sel->outputs_written &= ~(1ull << id); break; @@ -1856,21 +1883,30 @@ void si_init_shader_selector_async(void *job, int thread_index) case TGSI_SEMANTIC_CLIPVERTEX: case TGSI_SEMANTIC_EDGEFLAG: break; - default: - id = si_shader_io_get_unique_index2(name, index); - sel->outputs_written2 &= ~(1u << id); } } } } /* Pre-compilation. 
*/ - if (sscreen->b.debug_flags & DBG_PRECOMPILE) { + if (sscreen->debug_flags & DBG(PRECOMPILE) && + /* GFX9 needs LS or ES for compilation, which we don't have here. */ + (sscreen->info.chip_class <= VI || + (sel->type != PIPE_SHADER_TESS_CTRL && + sel->type != PIPE_SHADER_GEOMETRY))) { struct si_shader_ctx_state state = {sel}; struct si_shader_key key; memset(&key, 0, sizeof(key)); - si_parse_next_shader_property(&sel->info, &key); + si_parse_next_shader_property(&sel->info, + sel->so.num_outputs != 0, + &key); + + /* GFX9 doesn't have LS and ES. */ + if (sscreen->info.chip_class >= GFX9) { + key.as_ls = 0; + key.as_es = 0; + } /* Set reasonable defaults, so that the shader key doesn't * cause any code to be eliminated. @@ -1912,6 +1948,30 @@ void si_init_shader_selector_async(void *job, int thread_index) } } +/* Return descriptor slot usage masks from the given shader info. */ +void si_get_active_slot_masks(const struct tgsi_shader_info *info, + uint32_t *const_and_shader_buffers, + uint64_t *samplers_and_images) +{ + unsigned start, num_shaderbufs, num_constbufs, num_images, num_samplers; + + num_shaderbufs = util_last_bit(info->shader_buffers_declared); + num_constbufs = util_last_bit(info->const_buffers_declared); + /* two 8-byte images share one 16-byte slot */ + num_images = align(util_last_bit(info->images_declared), 2); + num_samplers = util_last_bit(info->samplers_declared); + + /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ + start = si_get_shaderbuf_slot(num_shaderbufs - 1); + *const_and_shader_buffers = + u_bit_consecutive(start, num_shaderbufs + num_constbufs); + + /* The layout is: image[last] ... image[0], sampler[0] ... sampler[last] */ + start = si_get_image_slot(num_images - 1) / 2; + *samplers_and_images = + u_bit_consecutive64(start, num_images / 2 + num_samplers); +} + static void *si_create_shader_selector(struct pipe_context *ctx, const struct pipe_shader_state *state) { @@ -1925,23 +1985,54 @@ static void *si_create_shader_selector(struct pipe_context *ctx, pipe_reference_init(&sel->reference, 1); sel->screen = sscreen; - sel->compiler_ctx_state.tm = sctx->tm; - sel->compiler_ctx_state.debug = sctx->b.debug; + sel->compiler_ctx_state.debug = sctx->debug; sel->compiler_ctx_state.is_debug_context = sctx->is_debug; - sel->tokens = tgsi_dup_tokens(state->tokens); - if (!sel->tokens) { - FREE(sel); - return NULL; - } sel->so = state->stream_output; - tgsi_scan_shader(state->tokens, &sel->info); + + if (state->type == PIPE_SHADER_IR_TGSI) { + sel->tokens = tgsi_dup_tokens(state->tokens); + if (!sel->tokens) { + FREE(sel); + return NULL; + } + + tgsi_scan_shader(state->tokens, &sel->info); + tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info); + } else { + assert(state->type == PIPE_SHADER_IR_NIR); + + sel->nir = state->ir.nir; + + si_nir_scan_shader(sel->nir, &sel->info); + si_nir_scan_tess_ctrl(sel->nir, &sel->info, &sel->tcs_info); + + si_lower_nir(sel); + } + sel->type = sel->info.processor; - p_atomic_inc(&sscreen->b.num_shaders_created); + p_atomic_inc(&sscreen->num_shaders_created); + si_get_active_slot_masks(&sel->info, + &sel->active_const_and_shader_buffers, + &sel->active_samplers_and_images); + + /* Record which streamout buffers are enabled. */ + for (i = 0; i < sel->so.num_outputs; i++) { + sel->enabled_streamout_buffer_mask |= + (1 << sel->so.output[i].output_buffer) << + (sel->so.output[i].stream * 4); + } /* The prolog is a no-op if there are no inputs. 
*/ sel->vs_needs_prolog = sel->type == PIPE_SHADER_VERTEX && - sel->info.num_inputs; + sel->info.num_inputs && + !sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS]; + + sel->force_correct_derivs_after_kill = + sel->type == PIPE_SHADER_FRAGMENT && + sel->info.uses_derivatives && + sel->info.uses_kill && + sctx->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL); /* Set which opcode uses which (i,j) pair. */ if (sel->info.uses_persp_opcode_interp_centroid) @@ -1982,8 +2073,8 @@ static void *si_create_shader_selector(struct pipe_context *ctx, case PIPE_SHADER_TESS_CTRL: /* Always reserve space for these. */ sel->patch_outputs_written |= - (1llu << si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0)) | - (1llu << si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0)); + (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0)) | + (1ull << si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0)); /* fall through */ case PIPE_SHADER_VERTEX: case PIPE_SHADER_TESS_EVAL: @@ -1996,26 +2087,21 @@ static void *si_create_shader_selector(struct pipe_context *ctx, case TGSI_SEMANTIC_TESSOUTER: case TGSI_SEMANTIC_PATCH: sel->patch_outputs_written |= - 1llu << si_shader_io_get_unique_index(name, index); + 1ull << si_shader_io_get_unique_index_patch(name, index); break; case TGSI_SEMANTIC_GENERIC: /* don't process indices the function can't handle */ - if (index >= 60) + if (index >= SI_MAX_IO_GENERIC) break; /* fall through */ - case TGSI_SEMANTIC_POSITION: - case TGSI_SEMANTIC_PSIZE: - case TGSI_SEMANTIC_CLIPDIST: + default: sel->outputs_written |= - 1llu << si_shader_io_get_unique_index(name, index); + 1ull << si_shader_io_get_unique_index(name, index); break; case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */ case TGSI_SEMANTIC_EDGEFLAG: break; - default: - sel->outputs_written2 |= - 1u << si_shader_io_get_unique_index2(name, index); } } sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; @@ -2033,16 +2119,17 @@ static void *si_create_shader_selector(struct pipe_context *ctx, unsigned index = sel->info.input_semantic_index[i]; switch (name) { - case TGSI_SEMANTIC_CLIPDIST: case TGSI_SEMANTIC_GENERIC: + /* don't process indices the function can't handle */ + if (index >= SI_MAX_IO_GENERIC) + break; + /* fall through */ + default: sel->inputs_read |= - 1llu << si_shader_io_get_unique_index(name, index); + 1ull << si_shader_io_get_unique_index(name, index); break; case TGSI_SEMANTIC_PCOORD: /* ignore this */ break; - default: - sel->inputs_read2 |= - 1u << si_shader_io_get_unique_index2(name, index); } } @@ -2059,6 +2146,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx, break; } + /* PA_CL_VS_OUT_CNTL */ + bool misc_vec_ena = + sel->info.writes_psize || sel->info.writes_edgeflag || + sel->info.writes_layer || sel->info.writes_viewport_index; + sel->pa_cl_vs_out_cntl = + S_02881C_USE_VTX_POINT_SIZE(sel->info.writes_psize) | + S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag) | + S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | + S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | + S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | + S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena); + sel->clipdist_mask = sel->info.writes_clipvertex ? 
+ SIX_BITS : sel->info.clipdist_writemask; + sel->culldist_mask = sel->info.culldist_writemask << + sel->info.num_written_clipdistance; + /* DB_SHADER_CONTROL */ sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) | @@ -2112,21 +2215,84 @@ static void *si_create_shader_selector(struct pipe_context *ctx, (void) mtx_init(&sel->mutex, mtx_plain); util_queue_fence_init(&sel->ready); - if ((sctx->b.debug.debug_message && !sctx->b.debug.async) || - sctx->is_debug || - r600_can_dump_shader(&sscreen->b, sel->info.processor)) - si_init_shader_selector_async(sel, -1); - else - util_queue_add_job(&sscreen->shader_compiler_queue, sel, - &sel->ready, si_init_shader_selector_async, - NULL); + struct util_async_debug_callback async_debug; + bool wait = + (sctx->debug.debug_message && !sctx->debug.async) || + sctx->is_debug || + si_can_dump_shader(sscreen, sel->info.processor); + + if (wait) { + u_async_debug_init(&async_debug); + sel->compiler_ctx_state.debug = async_debug.base; + } + + util_queue_add_job(&sscreen->shader_compiler_queue, sel, + &sel->ready, si_init_shader_selector_async, + NULL); + + if (wait) { + util_queue_fence_wait(&sel->ready); + u_async_debug_drain(&async_debug, &sctx->debug); + u_async_debug_cleanup(&async_debug); + } return sel; } +static void si_update_streamout_state(struct si_context *sctx) +{ + struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso; + + if (!shader_with_so) + return; + + sctx->streamout.enabled_stream_buffers_mask = + shader_with_so->enabled_streamout_buffer_mask; + sctx->streamout.stride_in_dw = shader_with_so->so.stride; +} + +static void si_update_clip_regs(struct si_context *sctx, + struct si_shader_selector *old_hw_vs, + struct si_shader *old_hw_vs_variant, + struct si_shader_selector *next_hw_vs, + struct si_shader *next_hw_vs_variant) +{ + if (next_hw_vs && + (!old_hw_vs || + old_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] != + next_hw_vs->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] || + old_hw_vs->pa_cl_vs_out_cntl != next_hw_vs->pa_cl_vs_out_cntl || + old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask || + old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || + !old_hw_vs_variant || + !next_hw_vs_variant || + old_hw_vs_variant->key.opt.clip_disable != + next_hw_vs_variant->key.opt.clip_disable)) + si_mark_atom_dirty(sctx, &sctx->clip_regs); +} + +static void si_update_common_shader_state(struct si_context *sctx) +{ + sctx->uses_bindless_samplers = + si_shader_uses_bindless_samplers(sctx->vs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->gs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->ps_shader.cso) || + si_shader_uses_bindless_samplers(sctx->tcs_shader.cso) || + si_shader_uses_bindless_samplers(sctx->tes_shader.cso); + sctx->uses_bindless_images = + si_shader_uses_bindless_images(sctx->vs_shader.cso) || + si_shader_uses_bindless_images(sctx->gs_shader.cso) || + si_shader_uses_bindless_images(sctx->ps_shader.cso) || + si_shader_uses_bindless_images(sctx->tcs_shader.cso) || + si_shader_uses_bindless_images(sctx->tes_shader.cso); + sctx->do_update_shaders = true; +} + static void si_bind_vs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); struct si_shader_selector *sel = state; if (sctx->vs_shader.cso == sel) @@ -2134,14 +2300,34 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void 
*state) sctx->vs_shader.cso = sel; sctx->vs_shader.current = sel ? sel->first_variant : NULL; - sctx->do_update_shaders = true; - si_mark_atom_dirty(sctx, &sctx->clip_regs); - r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + sctx->num_vs_blit_sgprs = sel ? sel->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS] : 0; + + si_update_common_shader_state(sctx); + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, + si_get_vs(sctx)->cso, si_get_vs_state(sctx)); +} + +static void si_update_tess_uses_prim_id(struct si_context *sctx) +{ + sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id = + (sctx->tes_shader.cso && + sctx->tes_shader.cso->info.uses_primid) || + (sctx->tcs_shader.cso && + sctx->tcs_shader.cso->info.uses_primid) || + (sctx->gs_shader.cso && + sctx->gs_shader.cso->info.uses_primid) || + (sctx->ps_shader.cso && !sctx->gs_shader.cso && + sctx->ps_shader.cso->info.uses_primid); } static void si_bind_gs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); struct si_shader_selector *sel = state; bool enable_changed = !!sctx->gs_shader.cso != !!sel; @@ -2151,22 +2337,20 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state) sctx->gs_shader.cso = sel; sctx->gs_shader.current = sel ? sel->first_variant : NULL; sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; - sctx->do_update_shaders = true; - si_mark_atom_dirty(sctx, &sctx->clip_regs); + + si_update_common_shader_state(sctx); sctx->last_rast_prim = -1; /* reset this so that it gets updated */ - if (enable_changed) + if (enable_changed) { si_shader_change_notify(sctx); - r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); -} - -static void si_update_tcs_tes_uses_prim_id(struct si_context *sctx) -{ - sctx->ia_multi_vgt_param_key.u.tcs_tes_uses_prim_id = - (sctx->tes_shader.cso && - sctx->tes_shader.cso->info.uses_primid) || - (sctx->tcs_shader.cso && - sctx->tcs_shader.cso->info.uses_primid); + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + } + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, + si_get_vs(sctx)->cso, si_get_vs_state(sctx)); } static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) @@ -2180,16 +2364,21 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) sctx->tcs_shader.cso = sel; sctx->tcs_shader.current = sel ? 
sel->first_variant : NULL; - si_update_tcs_tes_uses_prim_id(sctx); - sctx->do_update_shaders = true; + si_update_tess_uses_prim_id(sctx); + + si_update_common_shader_state(sctx); if (enable_changed) sctx->last_tcs = NULL; /* invalidate derived tess state */ + + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tes_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs_state(sctx); struct si_shader_selector *sel = state; bool enable_changed = !!sctx->tes_shader.cso != !!sel; @@ -2199,40 +2388,63 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->tes_shader.cso = sel; sctx->tes_shader.current = sel ? sel->first_variant : NULL; sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; - si_update_tcs_tes_uses_prim_id(sctx); - sctx->do_update_shaders = true; - si_mark_atom_dirty(sctx, &sctx->clip_regs); + si_update_tess_uses_prim_id(sctx); + + si_update_common_shader_state(sctx); sctx->last_rast_prim = -1; /* reset this so that it gets updated */ if (enable_changed) { si_shader_change_notify(sctx); sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ } - r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_update_vs_viewport_state(sctx); + si_set_active_descriptors_for_shader(sctx, sel); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, + si_get_vs(sctx)->cso, si_get_vs_state(sctx)); } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_sel = sctx->ps_shader.cso; struct si_shader_selector *sel = state; /* skip if supplied shader is one already in use */ - if (sctx->ps_shader.cso == sel) + if (old_sel == sel) return; sctx->ps_shader.cso = sel; sctx->ps_shader.current = sel ? 
sel->first_variant : NULL; - sctx->do_update_shaders = true; - si_mark_atom_dirty(sctx, &sctx->cb_render_state); + + si_update_common_shader_state(sctx); + if (sel) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + + if (!old_sel || + old_sel->info.colors_written != sel->info.colors_written) + si_mark_atom_dirty(sctx, &sctx->cb_render_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_sel || + old_sel->info.writes_memory != sel->info.writes_memory || + old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] != + sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) + si_mark_atom_dirty(sctx, &sctx->msaa_config); + } + si_set_active_descriptors_for_shader(sctx, sel); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) { if (shader->is_optimized) { - util_queue_fence_wait(&shader->optimized_ready); - util_queue_fence_destroy(&shader->optimized_ready); + util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, + &shader->ready); } + util_queue_fence_destroy(&shader->ready); + if (shader->pm4) { switch (shader->selector->type) { case PIPE_SHADER_VERTEX: @@ -2274,8 +2486,8 @@ static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) free(shader); } -static void si_destroy_shader_selector(struct si_context *sctx, - struct si_shader_selector *sel) +void si_destroy_shader_selector(struct si_context *sctx, + struct si_shader_selector *sel) { struct si_shader *p = sel->first_variant, *c; struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = { @@ -2286,7 +2498,7 @@ static void si_destroy_shader_selector(struct si_context *sctx, [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader, }; - util_queue_fence_wait(&sel->ready); + util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready); if (current_shader[sel->type]->cso == sel) { current_shader[sel->type]->cso = NULL; @@ -2311,6 +2523,7 @@ static void si_destroy_shader_selector(struct si_context *sctx, util_queue_fence_destroy(&sel->ready); mtx_destroy(&sel->mutex); free(sel->tokens); + ralloc_free(sel->nir); free(sel); } @@ -2455,7 +2668,7 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) struct si_pm4_state *pm4; /* Chip constants. */ - unsigned num_se = sctx->screen->b.info.max_se; + unsigned num_se = sctx->screen->info.max_se; unsigned wave_size = 64; unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ /* On SI-CI, the value comes from VGT_GS_VERTEX_REUSE = 16. @@ -2502,7 +2715,7 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) if (update_esgs) { pipe_resource_reference(&sctx->esgs_ring, NULL); sctx->esgs_ring = - r600_aligned_buffer_create(sctx->b.b.screen, + si_aligned_buffer_create(sctx->b.b.screen, R600_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, esgs_ring_size, alignment); @@ -2513,7 +2726,7 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) if (update_gsvs) { pipe_resource_reference(&sctx->gsvs_ring, NULL); sctx->gsvs_ring = - r600_aligned_buffer_create(sctx->b.b.screen, + si_aligned_buffer_create(sctx->b.b.screen, R600_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, gsvs_ring_size, alignment); @@ -2556,7 +2769,7 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) /* Flush the context to re-emit both init_config states. */ sctx->b.initial_gfx_cs_size = 0; /* force flush */ - si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL); + si_context_gfx_flush(sctx, PIPE_FLUSH_ASYNC, NULL); /* Set ring bindings. 
*/ if (sctx->esgs_ring) { @@ -2577,6 +2790,22 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) return true; } +static void si_shader_lock(struct si_shader *shader) +{ + mtx_lock(&shader->selector->mutex); + if (shader->previous_stage_sel) { + assert(shader->previous_stage_sel != shader->selector); + mtx_lock(&shader->previous_stage_sel->mutex); + } +} + +static void si_shader_unlock(struct si_shader *shader) +{ + if (shader->previous_stage_sel) + mtx_unlock(&shader->previous_stage_sel->mutex); + mtx_unlock(&shader->selector->mutex); +} + /** * @returns 1 if \p sel has been updated to use a new scratch buffer * 0 if not @@ -2595,25 +2824,40 @@ static int si_update_scratch_buffer(struct si_context *sctx, if (shader->config.scratch_bytes_per_wave == 0) return 0; + /* Prevent race conditions when updating: + * - si_shader::scratch_bo + * - si_shader::binary::code + * - si_shader::previous_stage::binary::code. + */ + si_shader_lock(shader); + /* This shader is already configured to use the current * scratch buffer. */ - if (shader->scratch_bo == sctx->scratch_buffer) + if (shader->scratch_bo == sctx->scratch_buffer) { + si_shader_unlock(shader); return 0; + } assert(sctx->scratch_buffer); - si_shader_apply_scratch_relocs(sctx, shader, &shader->config, scratch_va); + if (shader->previous_stage) + si_shader_apply_scratch_relocs(shader->previous_stage, scratch_va); + + si_shader_apply_scratch_relocs(shader, scratch_va); /* Replace the shader bo with a new bo that has the relocs applied. */ r = si_shader_binary_upload(sctx->screen, shader); - if (r) + if (r) { + si_shader_unlock(shader); return r; + } /* Update the shader state to use the new shader bo. */ si_shader_init_pm4_state(sctx->screen, shader); r600_resource_reference(&shader->scratch_bo, sctx->scratch_buffer); + si_shader_unlock(shader); return 1; } @@ -2627,6 +2871,15 @@ static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader) return shader ? shader->config.scratch_bytes_per_wave : 0; } +static struct si_shader *si_get_tcs_current(struct si_context *sctx) +{ + if (!sctx->tes_shader.cso) + return NULL; /* tessellation disabled */ + + return sctx->tcs_shader.cso ? sctx->tcs_shader.current : + sctx->fixed_func_tcs_shader.current; +} + static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) { unsigned bytes = 0; @@ -2634,11 +2887,71 @@ static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); - bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); + + if (sctx->tes_shader.cso) { + struct si_shader *tcs = si_get_tcs_current(sctx); + + bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(tcs)); + } return bytes; } +static bool si_update_scratch_relocs(struct si_context *sctx) +{ + struct si_shader *tcs = si_get_tcs_current(sctx); + int r; + + /* Update the shaders, so that they are using the latest scratch. + * The scratch buffer may have been changed since these shaders were + * last used, so we still need to try to update them, even if they + * require scratch buffers smaller than the current size. 
+ */ + r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, tcs); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, hs, tcs->pm4); + + /* VS can be bound as LS, ES, or VS. */ + r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->tes_shader.current) + si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); + else if (sctx->gs_shader.current) + si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + } + + /* TES can be bound as ES or VS. */ + r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->gs_shader.current) + si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); + } + + return true; +} + static bool si_update_spi_tmpring_size(struct si_context *sctx) { unsigned current_scratch_buffer_size = @@ -2648,7 +2961,6 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) unsigned scratch_needed_size = scratch_bytes_per_wave * sctx->scratch_waves; unsigned spi_tmpring_size; - int r; if (scratch_needed_size > 0) { if (scratch_needed_size > current_scratch_buffer_size) { @@ -2656,7 +2968,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) r600_resource_reference(&sctx->scratch_buffer, NULL); sctx->scratch_buffer = (struct r600_resource*) - r600_aligned_buffer_create(&sctx->screen->b.b, + si_aligned_buffer_create(&sctx->screen->b, R600_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, scratch_needed_size, 256); @@ -2664,56 +2976,12 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx) return false; si_mark_atom_dirty(sctx, &sctx->scratch_state); - r600_context_add_resource_size(&sctx->b.b, - &sctx->scratch_buffer->b.b); - } - - /* Update the shaders, so they are using the latest scratch. The - * scratch buffer may have been changed since these shaders were - * last used, so we still need to try to update them, even if - * they require scratch buffers smaller than the current size. - */ - r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); - - /* VS can be bound as LS, ES, or VS. */ - r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); - if (r < 0) - return false; - if (r == 1) { - if (sctx->tes_shader.current) - si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); - else if (sctx->gs_shader.current) - si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + si_context_add_resource_size(&sctx->b.b, + &sctx->scratch_buffer->b.b); } - /* TES can be bound as ES or VS. 
*/ - r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); - if (r < 0) + if (!si_update_scratch_relocs(sctx)) return false; - if (r == 1) { - if (sctx->gs_shader.current) - si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - } } /* The LLVM shader backend should be reporting aligned scratch_sizes. */ @@ -2734,9 +3002,12 @@ static void si_init_tess_factor_ring(struct si_context *sctx) bool double_offchip_buffers = sctx->b.chip_class >= CIK && sctx->b.family != CHIP_CARRIZO && sctx->b.family != CHIP_STONEY; - unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; + /* This must be one less than the maximum number due to a hw limitation. + * Various hardware bugs in SI, CIK, and GFX9 need this. + */ + unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63; unsigned max_offchip_buffers = max_offchip_buffers_per_se * - sctx->screen->b.info.max_se; + sctx->screen->info.max_se; unsigned offchip_granularity; switch (sctx->screen->tess_offchip_block_dw_size) { @@ -2751,43 +3022,42 @@ static void si_init_tess_factor_ring(struct si_context *sctx) break; } - switch (sctx->b.chip_class) { - case SI: - max_offchip_buffers = MIN2(max_offchip_buffers, 126); - break; - case CIK: - case VI: - case GFX9: - max_offchip_buffers = MIN2(max_offchip_buffers, 508); - break; - default: - assert(0); - return; - } - assert(!sctx->tf_ring); - sctx->tf_ring = r600_aligned_buffer_create(sctx->b.b.screen, + /* Use 64K alignment for both rings, so that we can pass the address + * to shaders as one SGPR containing bits [16:47]. + */ + sctx->tf_ring = si_aligned_buffer_create(sctx->b.b.screen, R600_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, - 32768 * sctx->screen->b.info.max_se, - 256); + 32768 * sctx->screen->info.max_se, + 64 * 1024); if (!sctx->tf_ring) return; assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0); sctx->tess_offchip_ring = - r600_aligned_buffer_create(sctx->b.b.screen, + si_aligned_buffer_create(sctx->b.b.screen, R600_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT, max_offchip_buffers * sctx->screen->tess_offchip_block_dw_size * 4, - 256); + 64 * 1024); if (!sctx->tess_offchip_ring) return; si_init_config_add_vgt_flush(sctx); + uint64_t offchip_va = r600_resource(sctx->tess_offchip_ring)->gpu_address; + uint64_t factor_va = r600_resource(sctx->tf_ring)->gpu_address; + assert((offchip_va & 0xffff) == 0); + assert((factor_va & 0xffff) == 0); + + si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tess_offchip_ring), + RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS); + si_pm4_add_bo(sctx->init_config, r600_resource(sctx->tf_ring), + RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS); + /* Append these registers to the init config state. 
*/ if (sctx->b.chip_class >= CIK) { if (sctx->b.chip_class >= VI) @@ -2796,10 +3066,10 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(sctx->tf_ring->width0 / 4)); si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE, - r600_resource(sctx->tf_ring)->gpu_address >> 8); + factor_va >> 8); if (sctx->b.chip_class >= GFX9) si_pm4_set_reg(sctx->init_config, R_030944_VGT_TF_MEMORY_BASE_HI, - r600_resource(sctx->tf_ring)->gpu_address >> 40); + factor_va >> 40); si_pm4_set_reg(sctx->init_config, R_03093C_VGT_HS_OFFCHIP_PARAM, S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | S_03093C_OFFCHIP_GRANULARITY(offchip_granularity)); @@ -2808,24 +3078,37 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(sctx->tf_ring->width0 / 4)); si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE, - r600_resource(sctx->tf_ring)->gpu_address >> 8); + factor_va >> 8); si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM, S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers)); } + if (sctx->b.chip_class >= GFX9) { + si_pm4_set_reg(sctx->init_config, + R_00B430_SPI_SHADER_USER_DATA_LS_0 + + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4, + offchip_va >> 16); + si_pm4_set_reg(sctx->init_config, + R_00B430_SPI_SHADER_USER_DATA_LS_0 + + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K * 4, + factor_va >> 16); + } else { + si_pm4_set_reg(sctx->init_config, + R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K * 4, + offchip_va >> 16); + si_pm4_set_reg(sctx->init_config, + R_00B430_SPI_SHADER_USER_DATA_HS_0 + + GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K * 4, + factor_va >> 16); + } + /* Flush the context to re-emit the init_config state. * This is done only once in a lifetime of a context. */ si_pm4_upload_indirect_buffer(sctx, sctx->init_config); sctx->b.initial_gfx_cs_size = 0; /* force flush */ - si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL); - - si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_FACTOR, sctx->tf_ring, - 0, sctx->tf_ring->width0, false, false, 0, 0, 0); - - si_set_ring_buffer(&sctx->b.b, SI_HS_RING_TESS_OFFCHIP, - sctx->tess_offchip_ring, 0, - sctx->tess_offchip_ring->width0, false, false, 0, 0, 0); + si_context_gfx_flush(sctx, PIPE_FLUSH_ASYNC, NULL); } /** @@ -2896,29 +3179,20 @@ static void si_update_vgt_shader_config(struct si_context *sctx) si_pm4_bind_state(sctx, vgt_shader_config, *pm4); } -static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader) -{ - struct pipe_stream_output_info *so = &shader->so; - uint32_t enabled_stream_buffers_mask = 0; - int i; - - for (i = 0; i < so->num_outputs; i++) - enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4); - sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask; - sctx->b.streamout.stride_in_dw = shader->so.stride; -} - bool si_update_shaders(struct si_context *sctx) { struct pipe_context *ctx = (struct pipe_context*)sctx; struct si_compiler_ctx_state compiler_state; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct si_shader *old_vs = si_get_vs_state(sctx); - bool old_clip_disable = old_vs ? old_vs->key.opt.hw_vs.clip_disable : false; + bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false; + struct si_shader *old_ps = sctx->ps_shader.current; + unsigned old_spi_shader_col_format = + old_ps ? 
old_ps->key.part.ps.epilog.spi_shader_col_format : 0; int r; compiler_state.tm = sctx->tm; - compiler_state.debug = sctx->b.debug; + compiler_state.debug = sctx->debug; compiler_state.is_debug_context = sctx->is_debug; /* Update stages before GS. */ @@ -2975,7 +3249,6 @@ bool si_update_shaders(struct si_context *sctx) if (r) return false; si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - si_update_so(sctx, sctx->tes_shader.cso); } } else if (sctx->gs_shader.cso) { if (sctx->b.chip_class <= VI) { @@ -2995,8 +3268,6 @@ bool si_update_shaders(struct si_context *sctx) if (r) return false; si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); - si_update_so(sctx, sctx->vs_shader.cso); - si_pm4_bind_state(sctx, ls, NULL); si_pm4_bind_state(sctx, hs, NULL); } @@ -3008,7 +3279,6 @@ bool si_update_shaders(struct si_context *sctx) return false; si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); si_pm4_bind_state(sctx, vs, sctx->gs_shader.cso->gs_copy_shader->pm4); - si_update_so(sctx, sctx->gs_shader.cso); if (!si_update_gs_ring_buffers(sctx)) return false; @@ -3020,7 +3290,7 @@ bool si_update_shaders(struct si_context *sctx) si_update_vgt_shader_config(sctx); - if (old_clip_disable != si_get_vs_state(sctx)->key.opt.hw_vs.clip_disable) + if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable) si_mark_atom_dirty(sctx, &sctx->clip_regs); if (sctx->ps_shader.cso) { @@ -3043,12 +3313,18 @@ bool si_update_shaders(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->spi_map); } - if (sctx->screen->b.rbplus_allowed && si_pm4_state_changed(sctx, ps)) + if (sctx->screen->rbplus_allowed && + si_pm4_state_changed(sctx, ps) && + (!old_ps || + old_spi_shader_col_format != + sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format)) si_mark_atom_dirty(sctx, &sctx->cb_render_state); if (sctx->ps_db_shader_control != db_shader_control) { sctx->ps_db_shader_control = db_shader_control; si_mark_atom_dirty(sctx, &sctx->db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->dpbb_state); } if (sctx->smoothing_enabled != sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) { @@ -3063,18 +3339,47 @@ bool si_update_shaders(struct si_context *sctx) } } - if (si_pm4_state_changed(sctx, ls) || - si_pm4_state_changed(sctx, hs) || - si_pm4_state_changed(sctx, es) || - si_pm4_state_changed(sctx, gs) || - si_pm4_state_changed(sctx, vs) || - si_pm4_state_changed(sctx, ps)) { + if (si_pm4_state_enabled_and_changed(sctx, ls) || + si_pm4_state_enabled_and_changed(sctx, hs) || + si_pm4_state_enabled_and_changed(sctx, es) || + si_pm4_state_enabled_and_changed(sctx, gs) || + si_pm4_state_enabled_and_changed(sctx, vs) || + si_pm4_state_enabled_and_changed(sctx, ps)) { if (!si_update_spi_tmpring_size(sctx)) return false; } - if (sctx->b.chip_class >= CIK) - si_mark_atom_dirty(sctx, &sctx->prefetch_L2); + if (sctx->b.chip_class >= CIK) { + if (si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + else if (!sctx->queued.named.ls) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + + if (si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + else if (!sctx->queued.named.hs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; + + if (si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + else if (!sctx->queued.named.es) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + + if (si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; 
+ else if (!sctx->queued.named.gs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + + if (si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + else if (!sctx->queued.named.vs) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + else if (!sctx->queued.named.ps) + sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS; + } sctx->do_update_shaders = false; return true; @@ -3095,6 +3400,71 @@ static void si_emit_scratch_state(struct si_context *sctx, } } +void *si_get_blit_vs(struct si_context *sctx, enum blitter_attrib_type type, + unsigned num_layers) +{ + struct pipe_context *pipe = &sctx->b.b; + unsigned vs_blit_property; + void **vs; + + switch (type) { + case UTIL_BLITTER_ATTRIB_NONE: + vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : + &sctx->vs_blit_pos; + vs_blit_property = SI_VS_BLIT_SGPRS_POS; + break; + case UTIL_BLITTER_ATTRIB_COLOR: + vs = num_layers > 1 ? &sctx->vs_blit_color_layered : + &sctx->vs_blit_color; + vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR; + break; + case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: + case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: + assert(num_layers == 1); + vs = &sctx->vs_blit_texcoord; + vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD; + break; + default: + assert(0); + return NULL; + } + if (*vs) + return *vs; + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX); + if (!ureg) + return NULL; + + /* Tell the shader to load VS inputs from SGPRs: */ + ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS, vs_blit_property); + ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true); + + /* This is just a pass-through shader with 1-3 MOV instructions. */ + ureg_MOV(ureg, + ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), + ureg_DECL_vs_input(ureg, 0)); + + if (type != UTIL_BLITTER_ATTRIB_NONE) { + ureg_MOV(ureg, + ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), + ureg_DECL_vs_input(ureg, 1)); + } + + if (num_layers > 1) { + struct ureg_src instance_id = + ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0); + struct ureg_dst layer = + ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0); + + ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X), + ureg_scalar(instance_id, TGSI_SWIZZLE_X)); + } + ureg_END(ureg); + + *vs = ureg_create_shader_and_destroy(ureg, pipe); + return *vs; +} + void si_init_shader_functions(struct si_context *sctx) { si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
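On the prefetch hunk above: instead of unconditionally dirtying one monolithic prefetch atom, each stage now keeps its own bit in prefetch_L2_mask, set when that stage's PM4 state is both bound and changed, and cleared when the stage is unbound so a stale prefetch is never issued for it. The same si_pm4_state_enabled_and_changed() test also gates the SPI_TMPRING_SIZE update, so merely unbinding a stage no longer forces scratch recomputation. A condensed standalone sketch of the per-stage pattern; the bit values and helper are illustrative, only the pattern mirrors the diff:

/* Per-stage L2 prefetch bookkeeping, condensed to one helper. */
#include <stdint.h>
#include <stdio.h>

#define SI_PREFETCH_HS (1 << 1)  /* illustrative bit assignments */
#define SI_PREFETCH_GS (1 << 3)

static void update_prefetch_bit(uint32_t *mask, uint32_t bit,
				int enabled_and_changed, int bound)
{
	if (enabled_and_changed)
		*mask |= bit;       /* new shader binary: prefetch it */
	else if (!bound)
		*mask &= ~bit;      /* stage unbound: drop any stale request */
	/* else: keep the bit as-is; a pending prefetch is still valid */
}

int main(void)
{
	uint32_t mask = 0;
	update_prefetch_bit(&mask, SI_PREFETCH_GS, 1, 1);  /* GS rebound */
	update_prefetch_bit(&mask, SI_PREFETCH_HS, 0, 0);  /* HS unbound */
	printf("prefetch mask = 0x%02x\n", (unsigned)mask); /* 0x08 */
	return 0;
}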
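The new si_get_blit_vs() above lazily builds and caches one pass-through vertex shader per variant: position only, position plus color, position plus texcoord, and layered flavors of the first two, where the instance ID is copied into the LAYER output so one instance is drawn per layer. The inputs come from user SGPRs rather than vertex buffers, which is what the VS_BLIT_SGPRS property communicates. A standalone model of the cache-slot selection; the enum, build_vs() and main() are stand-ins, not the mesa API:

/* Model of si_get_blit_vs()'s lazy per-variant shader cache. */
#include <stdio.h>
#include <stdlib.h>

enum blit_attrib { ATTRIB_NONE, ATTRIB_COLOR, ATTRIB_TEXCOORD };

struct blit_vs_cache {
	void *pos, *pos_layered;
	void *color, *color_layered;
	void *texcoord;              /* a layered texcoord VS is never needed */
};

static void *build_vs(void)
{
	printf("building variant\n"); /* stands in for the ureg_* calls */
	return malloc(1);
}

static void *get_blit_vs(struct blit_vs_cache *c, enum blit_attrib type,
			 unsigned num_layers)
{
	void **slot;

	switch (type) {
	case ATTRIB_NONE:
		slot = num_layers > 1 ? &c->pos_layered : &c->pos;
		break;
	case ATTRIB_COLOR:
		slot = num_layers > 1 ? &c->color_layered : &c->color;
		break;
	default:                      /* texcoord variants, single layer */
		slot = &c->texcoord;
		break;
	}
	if (!*slot)                   /* built on first use, then reused */
		*slot = build_vs();
	return *slot;
}

int main(void)
{
	struct blit_vs_cache c = {0};
	get_blit_vs(&c, ATTRIB_NONE, 6);  /* builds the layered-pos VS */
	get_blit_vs(&c, ATTRIB_NONE, 6);  /* cache hit: no rebuild */
	return 0;
}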