X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state.c;h=9b913212e4ad9b1eaa1c6a83629b61350c804620;hb=5fa2ab831e395a73247f6278a3d103608d1e7c7a;hp=68ba7d6403e7ea76e8f7a5c9529df8d00b5693e5;hpb=bcd2d2e1942ab7158dd46a5223130498cb0a8f44;p=mesa.git

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 68ba7d6403e..9b913212e4a 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -27,8 +27,8 @@
 #include "si_query.h"
 
 #include "util/u_dual_blend.h"
-#include "util/u_format.h"
-#include "util/u_format_s3tc.h"
+#include "util/format/u_format.h"
+#include "util/format/u_format_s3tc.h"
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
@@ -82,19 +82,17 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	/* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
 	 * but you never know. */
-	uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit;
+	uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit &
+				  blend->cb_target_mask;
 	unsigned i;
 
-	if (blend)
-		cb_target_mask &= blend->cb_target_mask;
-
 	/* Avoid a hang that happens when dual source blending is enabled
 	 * but there is not enough color outputs. This is undefined behavior,
 	 * so disable color writes completely.
 	 *
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
-	if (blend && blend->dual_src_blend &&
+	if (blend->dual_src_blend &&
 	    sctx->ps_shader.cso &&
 	    (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
 		cb_target_mask = 0;
@@ -102,12 +100,12 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 	/* GFX9: Flush DFSM when CB_TARGET_MASK changes.
 	 * I think we don't have to do anything between IBs.
 	 */
-	if (sctx->screen->dfsm_allowed &&
+	if (sctx->screen->dpbb_allowed &&
 	    sctx->last_cb_target_mask != cb_target_mask) {
 		sctx->last_cb_target_mask = cb_target_mask;
 
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
 	}
 
 	unsigned initial_cdw = cs->current.cdw;
@@ -115,26 +113,25 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 				   SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
 
 	if (sctx->chip_class >= GFX8) {
-		/* DCC MSAA workaround for blending.
+		/* DCC MSAA workaround.
 		 * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
 		 * COMBINER_DISABLE, but that would be more complicated.
 		 */
-		bool oc_disable = blend &&
-				  blend->blend_enable_4bit & cb_target_mask &&
+		bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask &&
 				  sctx->framebuffer.nr_samples >= 2;
 		unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;
 
 		radeon_opt_set_context_reg(
 				sctx, R_028424_CB_DCC_CONTROL,
 				SI_TRACKED_CB_DCC_CONTROL,
-				S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
+				S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
 				S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
 				S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
-				S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->has_dcc_constant_encode));
+				S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
 	}
 
 	/* RB+ register settings. */
-	if (sctx->screen->rbplus_allowed) {
+	if (sctx->screen->info.rbplus_allowed) {
 		unsigned spi_shader_col_format =
 			sctx->ps_shader.cso ?
sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0; @@ -148,8 +145,15 @@ static void si_emit_cb_render_state(struct si_context *sctx) unsigned format, swap, spi_format, colormask; bool has_alpha, has_rgb; - if (!surf) + if (!surf) { + /* If the color buffer is not set, the driver sets 32_R + * as the SPI color format, because the hw doesn't allow + * holes between color outputs, so also set this to + * enable RB+. + */ + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); continue; + } format = G_028C70_FORMAT(surf->cb_color_info); swap = G_028C70_COMP_SWAP(surf->cb_color_info); @@ -243,10 +247,8 @@ static void si_emit_cb_render_state(struct si_context *sctx) break; case V_028C70_COLOR_10_11_11: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); - sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4); - } break; case V_028C70_COLOR_2_10_10_10: @@ -258,6 +260,12 @@ static void si_emit_cb_render_state(struct si_context *sctx) } } + /* If there are no color outputs, the first color export is + * always enabled as 32_R, so also set this to enable RB+. + */ + if (!sx_ps_downconvert) + sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, @@ -466,6 +474,8 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, struct si_pm4_state *pm4 = &blend->pm4; uint32_t sx_mrt_blend_opt[8] = {0}; uint32_t color_control = 0; + bool logicop_enable = state->logicop_enable && + state->logicop_func != PIPE_LOGICOP_COPY; if (!blend) return NULL; @@ -473,9 +483,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->alpha_to_coverage = state->alpha_to_coverage; blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); - blend->logicop_enable = state->logicop_enable; + blend->logicop_enable = logicop_enable; - if (state->logicop_enable) { + if (logicop_enable) { color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); } else { color_control |= S_028808_ROP3(0xcc); @@ -608,6 +618,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->blend_enable_4bit |= 0xfu << (i * 4); + if (sctx->family <= CHIP_NAVI14) + blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); + /* This is only important for formats without alpha. */ if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || @@ -618,13 +631,16 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->need_src_alpha_4bit |= 0xfu << (i * 4); } + if (sctx->family <= CHIP_NAVI14 && logicop_enable) + blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; + if (blend->cb_target_mask) { color_control |= S_028808_MODE(mode); } else { color_control |= S_028808_MODE(V_028808_CB_DISABLE); } - if (sctx->screen->rbplus_allowed) { + if (sctx->screen->info.rbplus_allowed) { /* Disable RB+ blend optimizations for dual source blending. * Vulkan does this. */ @@ -641,7 +657,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, sx_mrt_blend_opt[i]); /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. 
*/ - if (blend->dual_src_blend || state->logicop_enable || + if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE) color_control |= S_028808_DISABLE_DUAL_QUAD(1); } @@ -662,21 +678,19 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) struct si_state_blend *old_blend = sctx->queued.named.blend; struct si_state_blend *blend = (struct si_state_blend *)state; - if (!state) - return; + if (!blend) + blend = (struct si_state_blend *)sctx->noop_blend; - si_pm4_bind_state(sctx, blend, state); + si_pm4_bind_state(sctx, blend, blend); - if (!old_blend || - old_blend->cb_target_mask != blend->cb_target_mask || + if (old_blend->cb_target_mask != blend->cb_target_mask || old_blend->dual_src_blend != blend->dual_src_blend || (old_blend->blend_enable_4bit != blend->blend_enable_4bit && sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed)) si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - if (!old_blend || - old_blend->cb_target_mask != blend->cb_target_mask || + if (old_blend->cb_target_mask != blend->cb_target_mask || old_blend->alpha_to_coverage != blend->alpha_to_coverage || old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || @@ -685,15 +699,13 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) sctx->do_update_shaders = true; if (sctx->screen->dpbb_allowed && - (!old_blend || - old_blend->alpha_to_coverage != blend->alpha_to_coverage || + (old_blend->alpha_to_coverage != blend->alpha_to_coverage || old_blend->blend_enable_4bit != blend->blend_enable_4bit || old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); if (sctx->screen->has_out_of_order_rast && - (!old_blend || - (old_blend->blend_enable_4bit != blend->blend_enable_4bit || + ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || old_blend->commutative_4bit != blend->commutative_4bit || old_blend->logicop_enable != blend->logicop_enable))) @@ -703,6 +715,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) static void si_delete_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + + if (sctx->queued.named.blend == state) + si_bind_blend_state(ctx, sctx->noop_blend); + si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); } @@ -763,7 +779,7 @@ static void si_emit_clip_regs(struct si_context *sctx) { struct si_shader *vs = si_get_vs_state(sctx); struct si_shader_selector *vs_sel = vs->selector; - struct tgsi_shader_info *info = &vs_sel->info; + struct si_shader_info *info = &vs_sel->info; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; @@ -789,12 +805,20 @@ static void si_emit_clip_regs(struct si_context *sctx) culldist_mask |= clipdist_mask; unsigned initial_cdw = sctx->gfx_cs->current.cdw; - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL, - vs_sel->pa_cl_vs_out_cntl | - S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | - clipdist_mask | (culldist_mask << 8)); + unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | + clipdist_mask | (culldist_mask << 8); + + if 
(sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + pa_cl_cntl, + ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } else { + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); + } radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, rs->pa_cl_clip_cntl | @@ -812,7 +836,7 @@ static void si_update_poly_offset_state(struct si_context *sctx) { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { + if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { si_pm4_bind_state(sctx, poly_offset, NULL); return; } @@ -892,8 +916,17 @@ static void *si_create_rs_state(struct pipe_context *ctx, rs->clamp_fragment_color = state->clamp_fragment_color; rs->clamp_vertex_color = state->clamp_vertex_color; rs->flatshade = state->flatshade; + rs->flatshade_first = state->flatshade_first; rs->sprite_coord_enable = state->sprite_coord_enable; rs->rasterizer_discard = state->rasterizer_discard; + rs->polygon_mode_enabled = (state->fill_front != PIPE_POLYGON_MODE_FILL && + !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && + !(state->cull_face & PIPE_FACE_BACK)); + rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && + !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back == PIPE_POLYGON_MODE_LINE && + !(state->cull_face & PIPE_FACE_BACK)); rs->pa_sc_line_stipple = state->line_stipple_enable ? S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; @@ -951,8 +984,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(state->fill_front != PIPE_POLYGON_MODE_FILL || - state->fill_back != PIPE_POLYGON_MODE_FILL) | + S_028814_POLY_MODE(rs->polygon_mode_enabled) | S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back))); @@ -1014,14 +1046,14 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) (struct si_state_rasterizer*)sctx->queued.named.rasterizer; struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - if (!state) - return; + if (!rs) + rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; - if (!old_rs || old_rs->multisample_enable != rs->multisample_enable) { + if (old_rs->multisample_enable != rs->multisample_enable) { si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); /* Update the small primitive filter workaround if necessary. 
*/ - if (sctx->screen->has_msaa_sample_loc_bug && + if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } @@ -1032,30 +1064,22 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) si_pm4_bind_state(sctx, rasterizer, rs); si_update_poly_offset_state(sctx); - if (!old_rs || - old_rs->scissor_enable != rs->scissor_enable) + if (old_rs->scissor_enable != rs->scissor_enable) si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - if (!old_rs || - old_rs->line_width != rs->line_width || + if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size || old_rs->half_pixel_center != rs->half_pixel_center) si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - if (!old_rs || - old_rs->clip_halfz != rs->clip_halfz) + if (old_rs->clip_halfz != rs->clip_halfz) si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); - if (!old_rs || - old_rs->clip_plane_enable != rs->clip_plane_enable || + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); - sctx->ia_multi_vgt_param_key.u.line_stipple_enabled = - rs->line_stipple_enable; - - if (!old_rs || - old_rs->clip_plane_enable != rs->clip_plane_enable || + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || old_rs->flatshade != rs->flatshade || @@ -1075,7 +1099,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; if (sctx->queued.named.rasterizer == state) - si_pm4_bind_state(sctx, poly_offset, NULL); + si_bind_rs_state(ctx, sctx->discard_rasterizer_state); FREE(rs->pm4_poly_offset); si_pm4_delete_state(sctx, rasterizer, rs); @@ -1290,8 +1314,8 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) struct si_state_dsa *old_dsa = sctx->queued.named.dsa; struct si_state_dsa *dsa = state; - if (!state) - return; + if (!dsa) + dsa = (struct si_state_dsa *)sctx->noop_dsa; si_pm4_bind_state(sctx, dsa, dsa); @@ -1301,19 +1325,17 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func) + if (old_dsa->alpha_func != dsa->alpha_func) sctx->do_update_shaders = true; if (sctx->screen->dpbb_allowed && - (!old_dsa || - (old_dsa->depth_enabled != dsa->depth_enabled || + ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || old_dsa->db_can_write != dsa->db_can_write))) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); if (sctx->screen->has_out_of_order_rast && - (!old_dsa || - memcmp(old_dsa->order_invariance, dsa->order_invariance, + (memcmp(old_dsa->order_invariance, dsa->order_invariance, sizeof(old_dsa->order_invariance)))) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } @@ -1321,6 +1343,10 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) static void si_delete_dsa_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + + if (sctx->queued.named.dsa == state) + si_bind_dsa_state(ctx, sctx->noop_dsa); + si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); } @@ -1333,7 +1359,7 @@ static void *si_create_db_flush_dsa(struct si_context *sctx) /* DB RENDER STATE */ 
-static void si_set_active_query_state(struct pipe_context *ctx, boolean enable) +static void si_set_active_query_state(struct pipe_context *ctx, bool enable) { struct si_context *sctx = (struct si_context*)ctx; @@ -1421,6 +1447,7 @@ static void si_emit_db_render_state(struct si_context *sctx) if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) { bool perfect = sctx->num_perfect_occlusion_queries > 0; + bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; if (sctx->chip_class >= GFX7) { unsigned log_sample_rate = sctx->framebuffer.log_samples; @@ -1433,6 +1460,7 @@ static void si_emit_db_render_state(struct si_context *sctx) db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) | S_028004_SLICE_EVEN_ENABLE(1) | @@ -1474,8 +1502,8 @@ static void si_emit_db_render_state(struct si_context *sctx) if (!rs->multisample_enable) db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; - if (sctx->screen->has_rbplus && - !sctx->screen->rbplus_allowed) + if (sctx->screen->info.has_rbplus && + !sctx->screen->info.rbplus_allowed) db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, @@ -1971,7 +1999,7 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, /* GFX9 allocates 1D textures as 2D. */ if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) && - sscreen->info.chip_class >= GFX9 && + sscreen->info.chip_class == GFX9 && tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { if (res_target == PIPE_TEXTURE_1D) res_target = PIPE_TEXTURE_2D; @@ -2199,12 +2227,12 @@ static bool si_is_zs_format_supported(enum pipe_format format) return si_translate_dbformat(format) != V_028040_Z_INVALID; } -static boolean si_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) +static bool si_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned usage) { struct si_screen *sscreen = (struct si_screen *)screen; unsigned retval = 0; @@ -2214,6 +2242,13 @@ static boolean si_is_format_supported(struct pipe_screen *screen, return false; } + if (util_format_get_num_planes(format) >= 2) { + return util_format_planar_is_supported(screen, format, target, + sample_count, + storage_sample_count, + usage); + } + if (MAX2(1, sample_count) < MAX2(1, storage_sample_count)) return false; @@ -2221,9 +2256,6 @@ static boolean si_is_format_supported(struct pipe_screen *screen, if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE)) return false; - if (usage & PIPE_BIND_SHADER_IMAGE) - return false; - /* Only power-of-two sample counts are supported. 
*/ if (!util_is_power_of_two_or_zero(sample_count) || !util_is_power_of_two_or_zero(storage_sample_count)) @@ -2511,7 +2543,7 @@ static void si_initialize_color_surface(struct si_context *sctx, color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments); - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { color_info |= S_028C70_COMPRESSION(1); unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); @@ -2578,7 +2610,7 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) | S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) | S_028EE0_RESOURCE_LEVEL(1); - } else if (sctx->chip_class >= GFX9) { + } else if (sctx->chip_class == GFX9) { color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level); color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) | S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type); @@ -2646,7 +2678,7 @@ static void si_init_depth_surface(struct si_context *sctx, surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1); - if (si_htile_enabled(tex, level)) { + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1); @@ -2661,14 +2693,14 @@ static void si_init_depth_surface(struct si_context *sctx, if (sctx->chip_class >= GFX10) { z_info |= S_028040_ITERATE_FLUSH(1); - s_info |= S_028044_ITERATE_FLUSH(1); + s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled); } else { z_info |= S_028038_ITERATE_FLUSH(1); s_info |= S_02803C_ITERATE_FLUSH(1); } } - if (tex->surface.has_stencil) { + if (tex->surface.has_stencil && !tex->htile_stencil_disabled) { /* Stencil buffer workaround ported from the GFX6-GFX8 code. * See that for explanation. 
*/ @@ -2679,7 +2711,7 @@ static void si_init_depth_surface(struct si_context *sctx, } surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->htile_offset) >> 8; + tex->surface.htile_offset) >> 8; surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); if (sctx->chip_class == GFX9) { @@ -2733,7 +2765,7 @@ static void si_init_depth_surface(struct si_context *sctx, surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1); - if (si_htile_enabled(tex, level)) { + if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) { z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1); @@ -2760,7 +2792,7 @@ static void si_init_depth_surface(struct si_context *sctx, } surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->htile_offset) >> 8; + tex->surface.htile_offset) >> 8; surf->db_htile_surface = S_028ABC_FULL_CACHE(1); if (tex->tc_compatible_htile) { @@ -2804,8 +2836,10 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; struct si_texture *tex = (struct si_texture*)surf->texture; - if (tex->fmask_offset) + if (tex->surface.fmask_offset) { tex->dirty_level_mask |= 1 << surf->u.tex.level; + tex->fmask_is_not_identity = true; + } if (tex->dcc_gather_statistics) tex->separate_dcc_dirty = true; } @@ -2954,6 +2988,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.compressed_cb_mask = 0; sctx->framebuffer.uncompressed_cb_mask = 0; + sctx->framebuffer.displayable_dcc_cb_mask = 0; sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples; sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); @@ -2961,7 +2996,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.CB_has_shader_readable_metadata = false; sctx->framebuffer.DB_has_shader_readable_metadata = false; sctx->framebuffer.all_DCC_pipe_aligned = true; - unsigned num_bpp64_colorbufs = 0; + sctx->framebuffer.min_bytes_per_pixel = 0; for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2989,11 +3024,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (surf->color_is_int10) sctx->framebuffer.color_is_int10 |= 1 << i; - if (tex->fmask_offset) + if (tex->surface.fmask_offset) sctx->framebuffer.compressed_cb_mask |= 1 << i; else sctx->framebuffer.uncompressed_cb_mask |= 1 << i; + if (tex->surface.dcc_offset) + sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; + /* Don't update nr_color_samples for non-AA buffers. * (e.g. destination of MSAA resolve) */ @@ -3008,8 +3046,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (tex->surface.is_linear) sctx->framebuffer.any_dst_linear = true; - if (tex->surface.bpe >= 8) - num_bpp64_colorbufs++; if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { sctx->framebuffer.CB_has_shader_readable_metadata = true; @@ -3028,15 +3064,18 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.compressed_cb_mask |= 1 << i; vi_separate_dcc_start_query(sctx, tex); } + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; } /* For optimal DCC performance. 
*/ - if (sctx->chip_class == GFX8) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; - else if (num_bpp64_colorbufs >= 5) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 8; - else + if (sctx->chip_class >= GFX10) sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; + else + sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; struct si_texture *zstex = NULL; @@ -3048,10 +3087,16 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_init_depth_surface(sctx, surf); } - if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level)) + if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, + PIPE_MASK_ZS)) sctx->framebuffer.DB_has_shader_readable_metadata = true; si_context_add_resource_size(sctx, surf->base.texture); + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; } si_update_ps_colorbuf0_slot(sctx); @@ -3175,8 +3220,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx) if (cb->base.u.tex.level > 0) cb_color_info &= C_028C70_FAST_CLEAR; - if (tex->fmask_offset) { - cb_color_fmask = (tex->buffer.gpu_address + tex->fmask_offset) >> 8; + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; cb_color_fmask |= tex->surface.fmask_tile_swizzle; } @@ -3191,7 +3236,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_info |= S_028C70_DCC_ENABLE(1); cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + - tex->dcc_offset) >> 8; + tex->surface.dcc_offset) >> 8; unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; @@ -3204,7 +3249,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) /* Set mutable surface parameters. */ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) + if (!tex->surface.fmask_offset) cb_color_fmask = cb_color_base; if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; @@ -3243,10 +3288,10 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb->cb_color_attrib2); radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); - } else if (sctx->chip_class >= GFX9) { + } else if (sctx->chip_class == GFX9) { struct gfx9_surf_meta_flags meta; - if (tex->dcc_offset) + if (tex->surface.dcc_offset) meta = tex->surface.u.gfx9.dcc; else meta = tex->surface.u.gfx9.cmask; @@ -3254,7 +3299,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) /* Set mutable surface parameters. 
*/ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) + if (!tex->surface.fmask_offset) cb_color_fmask = cb_color_base; if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; @@ -3294,7 +3339,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) if (level_info->mode == RADEON_SURF_MODE_2D) cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) + if (!tex->surface.fmask_offset) cb_color_fmask = cb_color_base; if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; @@ -3310,7 +3355,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { if (sctx->chip_class >= GFX7) cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); @@ -3378,7 +3423,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ - } else if (sctx->chip_class >= GFX9) { + } else if (sctx->chip_class == GFX9) { radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ @@ -3451,7 +3496,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx) struct radeon_cmdbuf *cs = sctx->gfx_cs; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned nr_samples = sctx->framebuffer.nr_samples; - bool has_msaa_sample_loc_bug = sctx->screen->has_msaa_sample_loc_bug; + bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; /* Smoothing (only possible with nr_samples == 1) uses the same * sample locations as the MSAA it simulates. @@ -3514,11 +3559,7 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; - if (blend) { - colormask &= blend->cb_target_enabled_4bit; - } else { - colormask = 0; - } + colormask &= blend->cb_target_enabled_4bit; /* Conservative: No logic op. */ if (colormask && blend->logicop_enable) @@ -3790,14 +3831,7 @@ si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. */ - if (screen->info.chip_class == GFX9 && HAVE_LLVM < 0x0800) - /* When vindex == 0, LLVM < 8.0 sets IDXEN = 0, thus changing units - * from STRIDE to bytes. This works around it by setting - * NUM_RECORDS to at least the size of one element, so that - * the first element is readable when IDXEN == 0. - */ - num_records = num_records ? 
MAX2(num_records, stride) : 0; - else if (screen->info.chip_class == GFX8) + if (screen->info.chip_class == GFX8) num_records *= stride; state[4] = 0; @@ -3819,7 +3853,7 @@ si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, * else: swizzle_address >= NUM_RECORDS */ state[7] |= S_008F0C_FORMAT(fmt->img_format) | - S_008F0C_OOB_SELECT(0) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | S_008F0C_RESOURCE_LEVEL(1); } else { int first_non_void; @@ -3906,7 +3940,7 @@ gfx10_make_texture_descriptor(struct si_screen *screen, /* * X24S8 is implemented as an 8_8_8_8 data format, to * fix texture gathers. This affects at least - * GL45-CTS.texture_cube_map_array.sampling on VI. + * GL45-CTS.texture_cube_map_array.sampling on GFX8. */ util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle); is_stencil = true; @@ -3976,17 +4010,17 @@ gfx10_make_texture_descriptor(struct si_screen *screen, state[6] = 0; state[7] = 0; - if (tex->dcc_offset) { + if (tex->surface.dcc_offset) { state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | - S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(pipe_format)); + S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); } /* Initialize the sampler view for FMASK. */ - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { uint32_t format; - va = tex->buffer.gpu_address + tex->fmask_offset; + va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) switch (FMASK(res->nr_samples, res->nr_storage_samples)) { @@ -4186,7 +4220,7 @@ si_make_texture_descriptor(struct si_screen *screen, } /* S8 with Z32 HTILE needs a special format. */ - if (screen->info.chip_class >= GFX9 && + if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT && tex->tc_compatible_htile) data_format = V_008F14_IMG_DATA_FORMAT_S8_32; @@ -4238,7 +4272,7 @@ si_make_texture_descriptor(struct si_screen *screen, state[6] = 0; state[7] = 0; - if (screen->info.chip_class >= GFX9) { + if (screen->info.chip_class == GFX9) { unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle); /* Depth is the the last accessible layer on Gfx9. @@ -4259,8 +4293,8 @@ si_make_texture_descriptor(struct si_screen *screen, state[5] |= S_008F24_LAST_ARRAY(last_layer); } - if (tex->dcc_offset) { - state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(pipe_format)); + if (tex->surface.dcc_offset) { + state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); } else { /* The last dword is unused by hw. The shader uses it to clear * bits in the first dword of sampler state. @@ -4274,13 +4308,13 @@ si_make_texture_descriptor(struct si_screen *screen, } /* Initialize the sampler view for FMASK. 
*/ - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { uint32_t data_format, num_format; - va = tex->buffer.gpu_address + tex->fmask_offset; + va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) - if (screen->info.chip_class >= GFX9) { + if (screen->info.chip_class == GFX9) { data_format = V_008F14_IMG_DATA_FORMAT_FMASK; switch (FMASK(res->nr_samples, res->nr_storage_samples)) { case FMASK(2,1): @@ -4389,7 +4423,7 @@ si_make_texture_descriptor(struct si_screen *screen, fmask_state[6] = 0; fmask_state[7] = 0; - if (screen->info.chip_class >= GFX9) { + if (screen->info.chip_class == GFX9) { fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode); fmask_state[4] |= S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch); @@ -4839,7 +4873,10 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, return NULL; v->count = count; - v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT); + + unsigned alloc_count = count > sscreen->num_vbos_in_user_sgprs ? + count - sscreen->num_vbos_in_user_sgprs : 0; + v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { const struct util_format_description *desc; @@ -5040,7 +5077,14 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) struct si_vertex_elements *v = (struct si_vertex_elements*)state; sctx->vertex_elements = v; - sctx->vertex_buffers_dirty = true; + sctx->num_vertex_elements = v ? v->count : 0; + + if (sctx->num_vertex_elements) { + sctx->vertex_buffers_dirty = true; + } else { + sctx->vertex_buffer_pointer_dirty = false; + sctx->vertex_buffer_user_sgprs_dirty = false; + } if (v && (!old || @@ -5076,8 +5120,10 @@ static void si_delete_vertex_element(struct pipe_context *ctx, void *state) struct si_context *sctx = (struct si_context *)ctx; struct si_vertex_elements *v = (struct si_vertex_elements*)state; - if (sctx->vertex_elements == state) + if (sctx->vertex_elements == state) { sctx->vertex_elements = NULL; + sctx->num_vertex_elements = 0; + } si_resource_reference(&v->instance_divisor_factor_buffer, NULL); FREE(state); } @@ -5088,8 +5134,9 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; + unsigned updated_mask = u_bit_consecutive(start_slot, count); uint32_t orig_unaligned = sctx->vertex_buffer_unaligned; - uint32_t unaligned = orig_unaligned; + uint32_t unaligned = 0; int i; assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer)); @@ -5099,14 +5146,14 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, const struct pipe_vertex_buffer *src = buffers + i; struct pipe_vertex_buffer *dsti = dst + i; struct pipe_resource *buf = src->buffer.resource; + unsigned slot_bit = 1 << (start_slot + i); pipe_resource_reference(&dsti->buffer.resource, buf); dsti->buffer_offset = src->buffer_offset; dsti->stride = src->stride; + if (dsti->buffer_offset & 3 || dsti->stride & 3) - unaligned |= 1 << (start_slot + i); - else - unaligned &= ~(1 << (start_slot + i)); + unaligned |= slot_bit; si_context_add_resource_size(sctx, buf); if (buf) @@ -5116,10 +5163,10 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, for (i = 0; i < count; i++) { pipe_resource_reference(&dst[i].buffer.resource, NULL); } - unaligned &= ~u_bit_consecutive(start_slot, count); + unaligned &= ~updated_mask; } 
sctx->vertex_buffers_dirty = true; - sctx->vertex_buffer_unaligned = unaligned; + sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned; /* Check whether alignment may have changed in a way that requires * shader changes. This check is conservative: a vertex buffer can only @@ -5130,7 +5177,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, */ if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count))) + (unaligned | orig_unaligned) & updated_mask)) sctx->do_update_shaders = true; } @@ -5386,13 +5433,9 @@ static void si_init_config(struct si_context *sctx) { struct si_screen *sscreen = sctx->screen; uint64_t border_color_va = sctx->border_color_buffer->gpu_address; - bool has_clear_state = sscreen->has_clear_state; + bool has_clear_state = sscreen->info.has_clear_state; struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - /* GFX6, radeon kernel disabled CLEAR_STATE. */ - assert(has_clear_state || sscreen->info.chip_class == GFX6 || - !sscreen->info.is_amdgpu); - if (!pm4) return; @@ -5426,22 +5469,25 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); } - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); + if (sscreen->info.chip_class <= GFX9) + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); if (!has_clear_state) si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); if (sctx->chip_class < GFX7) si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); + /* CLEAR_STATE doesn't restore these correctly. */ + si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, + S_028244_BR_X(16384) | S_028244_BR_Y(16384)); + /* CLEAR_STATE doesn't clear these correctly on certain generations. * I don't know why. Deduced by trial and error. 
*/ - if (sctx->chip_class <= GFX7) { + if (sctx->chip_class <= GFX7 || !has_clear_state) { si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, - S_028244_BR_X(16384) | S_028244_BR_Y(16384)); si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, S_028034_BR_X(16384) | S_028034_BR_Y(16384)); @@ -5465,10 +5511,13 @@ static void si_init_config(struct si_context *sctx) } if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); - } else if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + } else if (sctx->chip_class == GFX9) { si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); @@ -5487,9 +5536,6 @@ static void si_init_config(struct si_context *sctx) /* Logical CUs 16 - 31 */ si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, @@ -5515,8 +5561,6 @@ static void si_init_config(struct si_context *sctx) S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); } - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); /* Compute LATE_ALLOC_VS.LIMIT. */ unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; @@ -5540,13 +5584,44 @@ static void si_init_config(struct si_context *sctx) late_alloc_limit = (num_cu_per_sh - 2) * 4; } + unsigned late_alloc_limit_gs = late_alloc_limit; + unsigned cu_mask_vs = 0xffff; + unsigned cu_mask_gs = 0xffff; + + if (late_alloc_limit > 2) { + if (sctx->chip_class >= GFX10) { + /* CU2 & CU3 disabled because of the dual CU design */ + cu_mask_vs = 0xfff3; + cu_mask_gs = 0xfff3; /* NGG only */ + } else { + cu_mask_vs = 0xfffe; /* 1 CU disabled */ + } + } + + /* Don't use late alloc for NGG on Navi14 due to a hw bug. + * If NGG is never used, enable all CUs. + */ + if (!sscreen->use_ngg || sctx->family == CHIP_NAVI14) { + late_alloc_limit_gs = 0; + cu_mask_gs = 0xffff; + } + /* VS can't execute on one CU if the limit is > 2. */ si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(late_alloc_limit > 2 ? 
0xfffe : 0xffff) | + S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_limit)); + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); + + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs)); + } + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); } @@ -5563,7 +5638,50 @@ static void si_init_config(struct si_context *sctx) */ si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - si_pm4_set_reg(pm4, R_028838_PA_CL_NGG_CNTL, 0); /* TODO edge flags? */ + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + } + + si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, + S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_HTILE_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | + S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | + S_02807C_HTILE_RD_POLICY(V_02807C_CACHE_NOA_RD)); + + si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, + S_028410_CMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_DCC_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | + S_028410_CMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | + S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | + S_028410_DCC_RD_POLICY(V_028410_CACHE_NOA_RD) | + S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); + si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | + S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + + if (sctx->family == CHIP_NAVI10 || + sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) { + /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ + si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); + si_pm4_cmd_end(pm4, false); + } + /* TODO: For culling, replace 128 with 256. 
*/
+		si_pm4_set_reg(pm4, R_030980_GE_PC_ALLOC,
+			       S_030980_OVERSUB_EN(1) |
+			       S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1));
 	}
 
 	if (sctx->chip_class >= GFX8) {
@@ -5597,37 +5715,8 @@ static void si_init_config(struct si_context *sctx)
 			      RADEON_PRIO_BORDER_COLORS);
 
 	if (sctx->chip_class >= GFX9) {
-		unsigned num_se = sscreen->info.max_se;
-		unsigned pc_lines = 0;
-		unsigned max_alloc_count = 0;
-
-		switch (sctx->family) {
-		case CHIP_VEGA10:
-		case CHIP_VEGA12:
-		case CHIP_VEGA20:
-			pc_lines = 2048;
-			break;
-		case CHIP_RAVEN:
-		case CHIP_RAVEN2:
-		case CHIP_NAVI10:
-		case CHIP_NAVI12:
-			pc_lines = 1024;
-			break;
-		case CHIP_NAVI14:
-			pc_lines = 512;
-			break;
-		default:
-			assert(0);
-		}
-
-		if (sctx->chip_class >= GFX10) {
-			max_alloc_count = pc_lines / 3;
-		} else {
-			max_alloc_count = MIN2(128, pc_lines / (4 * num_se));
-		}
-
 		si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
-			       S_028C48_MAX_ALLOC_COUNT(max_alloc_count) |
+			       S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
 			       S_028C48_MAX_PRIM_PER_BATCH(1023));
 		si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
 			       S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));