X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state.c;h=8e6dd3c335bae8cce16a909e6636bf410eef3a49;hb=2c62b461e99c0d1d40d5aa16acfdd3df2fb790cd;hp=c96f7afc1bec38d6339fb341630c9008d6cb8bae;hpb=244a8e6798242427cc478a741d4b610c7eb2f954;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index c96f7afc1be..8e6dd3c335b 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -82,19 +82,17 @@ static void si_emit_cb_render_state(struct si_context *sctx) struct si_state_blend *blend = sctx->queued.named.blend; /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers, * but you never know. */ - uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit; + uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & + blend->cb_target_mask; unsigned i; - if (blend) - cb_target_mask &= blend->cb_target_mask; - /* Avoid a hang that happens when dual source blending is enabled * but there is not enough color outputs. This is undefined behavior, * so disable color writes completely. * * Reproducible with Unigine Heaven 4.0 and drirc missing. */ - if (blend && blend->dual_src_blend && + if (blend->dual_src_blend && sctx->ps_shader.cso && (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) cb_target_mask = 0; @@ -102,12 +100,12 @@ static void si_emit_cb_render_state(struct si_context *sctx) /* GFX9: Flush DFSM when CB_TARGET_MASK changes. * I think we don't have to do anything between IBs. */ - if (sctx->screen->dfsm_allowed && + if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) { sctx->last_cb_target_mask = cb_target_mask; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } unsigned initial_cdw = cs->current.cdw; @@ -115,26 +113,25 @@ static void si_emit_cb_render_state(struct si_context *sctx) SI_TRACKED_CB_TARGET_MASK, cb_target_mask); if (sctx->chip_class >= GFX8) { - /* DCC MSAA workaround for blending. + /* DCC MSAA workaround. * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_- * COMBINER_DISABLE, but that would be more complicated. */ - bool oc_disable = blend && - blend->blend_enable_4bit & cb_target_mask && + bool oc_disable = blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2; unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark; radeon_opt_set_context_reg( sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, - S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) | + S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) | S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) | S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) | - S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->has_dcc_constant_encode)); + S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode)); } /* RB+ register settings. */ - if (sctx->screen->rbplus_allowed) { + if (sctx->screen->info.rbplus_allowed) { unsigned spi_shader_col_format = sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0; @@ -148,8 +145,15 @@ static void si_emit_cb_render_state(struct si_context *sctx) unsigned format, swap, spi_format, colormask; bool has_alpha, has_rgb; - if (!surf) + if (!surf) { + /* If the color buffer is not set, the driver sets 32_R + * as the SPI color format, because the hw doesn't allow + * holes between color outputs, so also set this to + * enable RB+. + */ + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4); continue; + } format = G_028C70_FORMAT(surf->cb_color_info); swap = G_028C70_COMP_SWAP(surf->cb_color_info); @@ -243,10 +247,8 @@ static void si_emit_cb_render_state(struct si_context *sctx) break; case V_028C70_COLOR_10_11_11: - if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) { + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4); - sx_blend_opt_epsilon |= V_028758_11BIT_FORMAT << (i * 4); - } break; case V_028C70_COLOR_2_10_10_10: @@ -258,6 +260,12 @@ static void si_emit_cb_render_state(struct si_context *sctx) } } + /* If there are no color outputs, the first color export is + * always enabled as 32_R, so also set this to enable RB+. + */ + if (!sx_ps_downconvert) + sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */ radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, @@ -466,6 +474,8 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, struct si_pm4_state *pm4 = &blend->pm4; uint32_t sx_mrt_blend_opt[8] = {0}; uint32_t color_control = 0; + bool logicop_enable = state->logicop_enable && + state->logicop_func != PIPE_LOGICOP_COPY; if (!blend) return NULL; @@ -473,9 +483,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->alpha_to_coverage = state->alpha_to_coverage; blend->alpha_to_one = state->alpha_to_one; blend->dual_src_blend = util_blend_state_is_dual(state, 0); - blend->logicop_enable = state->logicop_enable; + blend->logicop_enable = logicop_enable; - if (state->logicop_enable) { + if (logicop_enable) { color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); } else { color_control |= S_028808_ROP3(0xcc); @@ -608,6 +618,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->blend_enable_4bit |= 0xfu << (i * 4); + if (sctx->family <= CHIP_NAVI14) + blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4); + /* This is only important for formats without alpha. */ if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA || @@ -618,13 +631,16 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, blend->need_src_alpha_4bit |= 0xfu << (i * 4); } + if (sctx->family <= CHIP_NAVI14 && logicop_enable) + blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit; + if (blend->cb_target_mask) { color_control |= S_028808_MODE(mode); } else { color_control |= S_028808_MODE(V_028808_CB_DISABLE); } - if (sctx->screen->rbplus_allowed) { + if (sctx->screen->info.rbplus_allowed) { /* Disable RB+ blend optimizations for dual source blending. * Vulkan does this. */ @@ -641,7 +657,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, sx_mrt_blend_opt[i]); /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */ - if (blend->dual_src_blend || state->logicop_enable || + if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE) color_control |= S_028808_DISABLE_DUAL_QUAD(1); } @@ -662,21 +678,19 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) struct si_state_blend *old_blend = sctx->queued.named.blend; struct si_state_blend *blend = (struct si_state_blend *)state; - if (!state) - return; + if (!blend) + blend = (struct si_state_blend *)sctx->noop_blend; - si_pm4_bind_state(sctx, blend, state); + si_pm4_bind_state(sctx, blend, blend); - if (!old_blend || - old_blend->cb_target_mask != blend->cb_target_mask || + if (old_blend->cb_target_mask != blend->cb_target_mask || old_blend->dual_src_blend != blend->dual_src_blend || (old_blend->blend_enable_4bit != blend->blend_enable_4bit && sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed)) si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); - if (!old_blend || - old_blend->cb_target_mask != blend->cb_target_mask || + if (old_blend->cb_target_mask != blend->cb_target_mask || old_blend->alpha_to_coverage != blend->alpha_to_coverage || old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || @@ -685,15 +699,13 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) sctx->do_update_shaders = true; if (sctx->screen->dpbb_allowed && - (!old_blend || - old_blend->alpha_to_coverage != blend->alpha_to_coverage || + (old_blend->alpha_to_coverage != blend->alpha_to_coverage || old_blend->blend_enable_4bit != blend->blend_enable_4bit || old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit)) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); if (sctx->screen->has_out_of_order_rast && - (!old_blend || - (old_blend->blend_enable_4bit != blend->blend_enable_4bit || + ((old_blend->blend_enable_4bit != blend->blend_enable_4bit || old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit || old_blend->commutative_4bit != blend->commutative_4bit || old_blend->logicop_enable != blend->logicop_enable))) @@ -703,6 +715,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) static void si_delete_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + + if (sctx->queued.named.blend == state) + si_bind_blend_state(ctx, sctx->noop_blend); + si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state); } @@ -789,12 +805,20 @@ static void si_emit_clip_regs(struct si_context *sctx) culldist_mask |= clipdist_mask; unsigned initial_cdw = sctx->gfx_cs->current.cdw; - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL, - vs_sel->pa_cl_vs_out_cntl | - S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | - clipdist_mask | (culldist_mask << 8)); + unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | + clipdist_mask | (culldist_mask << 8); + + if (sctx->chip_class >= GFX10) { + radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + pa_cl_cntl, + ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); + } else { + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, + SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, + vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); + } radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, rs->pa_cl_clip_cntl | @@ -812,7 +836,7 @@ static void si_update_poly_offset_state(struct si_context *sctx) { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { + if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) { si_pm4_bind_state(sctx, poly_offset, NULL); return; } @@ -1015,14 +1039,14 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) (struct si_state_rasterizer*)sctx->queued.named.rasterizer; struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; - if (!state) - return; + if (!rs) + rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state; - if (!old_rs || old_rs->multisample_enable != rs->multisample_enable) { + if (old_rs->multisample_enable != rs->multisample_enable) { si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); /* Update the small primitive filter workaround if necessary. */ - if (sctx->screen->has_msaa_sample_loc_bug && + if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } @@ -1033,30 +1057,25 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) si_pm4_bind_state(sctx, rasterizer, rs); si_update_poly_offset_state(sctx); - if (!old_rs || - old_rs->scissor_enable != rs->scissor_enable) + if (old_rs->scissor_enable != rs->scissor_enable) si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors); - if (!old_rs || - old_rs->line_width != rs->line_width || + if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size || old_rs->half_pixel_center != rs->half_pixel_center) si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); - if (!old_rs || - old_rs->clip_halfz != rs->clip_halfz) + if (old_rs->clip_halfz != rs->clip_halfz) si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports); - if (!old_rs || - old_rs->clip_plane_enable != rs->clip_plane_enable || + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); sctx->ia_multi_vgt_param_key.u.line_stipple_enabled = rs->line_stipple_enable; - if (!old_rs || - old_rs->clip_plane_enable != rs->clip_plane_enable || + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || old_rs->flatshade != rs->flatshade || @@ -1076,7 +1095,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state; if (sctx->queued.named.rasterizer == state) - si_pm4_bind_state(sctx, poly_offset, NULL); + si_bind_rs_state(ctx, sctx->discard_rasterizer_state); FREE(rs->pm4_poly_offset); si_pm4_delete_state(sctx, rasterizer, rs); @@ -1291,8 +1310,8 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) struct si_state_dsa *old_dsa = sctx->queued.named.dsa; struct si_state_dsa *dsa = state; - if (!state) - return; + if (!dsa) + dsa = (struct si_state_dsa *)sctx->noop_dsa; si_pm4_bind_state(sctx, dsa, dsa); @@ -1302,19 +1321,17 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func) + if (old_dsa->alpha_func != dsa->alpha_func) sctx->do_update_shaders = true; if (sctx->screen->dpbb_allowed && - (!old_dsa || - (old_dsa->depth_enabled != dsa->depth_enabled || + ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || old_dsa->db_can_write != dsa->db_can_write))) si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); if (sctx->screen->has_out_of_order_rast && - (!old_dsa || - memcmp(old_dsa->order_invariance, dsa->order_invariance, + (memcmp(old_dsa->order_invariance, dsa->order_invariance, sizeof(old_dsa->order_invariance)))) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); } @@ -1322,6 +1339,10 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) static void si_delete_dsa_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + + if (sctx->queued.named.dsa == state) + si_bind_dsa_state(ctx, sctx->noop_dsa); + si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state); } @@ -1334,7 +1355,7 @@ static void *si_create_db_flush_dsa(struct si_context *sctx) /* DB RENDER STATE */ -static void si_set_active_query_state(struct pipe_context *ctx, boolean enable) +static void si_set_active_query_state(struct pipe_context *ctx, bool enable) { struct si_context *sctx = (struct si_context*)ctx; @@ -1422,6 +1443,7 @@ static void si_emit_db_render_state(struct si_context *sctx) if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) { bool perfect = sctx->num_perfect_occlusion_queries > 0; + bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect; if (sctx->chip_class >= GFX7) { unsigned log_sample_rate = sctx->framebuffer.log_samples; @@ -1434,6 +1456,7 @@ static void si_emit_db_render_state(struct si_context *sctx) db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | + S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) | S_028004_SLICE_EVEN_ENABLE(1) | @@ -1475,8 +1498,8 @@ static void si_emit_db_render_state(struct si_context *sctx) if (!rs->multisample_enable) db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; - if (sctx->screen->has_rbplus && - !sctx->screen->rbplus_allowed) + if (sctx->screen->info.has_rbplus && + !sctx->screen->info.rbplus_allowed) db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, @@ -1972,7 +1995,7 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, /* GFX9 allocates 1D textures as 2D. */ if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) && - sscreen->info.chip_class >= GFX9 && + sscreen->info.chip_class == GFX9 && tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) { if (res_target == PIPE_TEXTURE_1D) res_target = PIPE_TEXTURE_2D; @@ -2200,12 +2223,12 @@ static bool si_is_zs_format_supported(enum pipe_format format) return si_translate_dbformat(format) != V_028040_Z_INVALID; } -static boolean si_is_format_supported(struct pipe_screen *screen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) +static bool si_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned usage) { struct si_screen *sscreen = (struct si_screen *)screen; unsigned retval = 0; @@ -2512,7 +2535,7 @@ static void si_initialize_color_surface(struct si_context *sctx, color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments); - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { color_info |= S_028C70_COMPRESSION(1); unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh); @@ -2680,7 +2703,7 @@ static void si_init_depth_surface(struct si_context *sctx, } surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->htile_offset) >> 8; + tex->surface.htile_offset) >> 8; surf->db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned); if (sctx->chip_class == GFX9) { @@ -2761,7 +2784,7 @@ static void si_init_depth_surface(struct si_context *sctx, } surf->db_htile_data_base = (tex->buffer.gpu_address + - tex->htile_offset) >> 8; + tex->surface.htile_offset) >> 8; surf->db_htile_surface = S_028ABC_FULL_CACHE(1); if (tex->tc_compatible_htile) { @@ -2805,7 +2828,7 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx) struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i]; struct si_texture *tex = (struct si_texture*)surf->texture; - if (tex->fmask_offset) + if (tex->surface.fmask_offset) tex->dirty_level_mask |= 1 << surf->u.tex.level; if (tex->dcc_gather_statistics) tex->separate_dcc_dirty = true; @@ -2962,7 +2985,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.CB_has_shader_readable_metadata = false; sctx->framebuffer.DB_has_shader_readable_metadata = false; sctx->framebuffer.all_DCC_pipe_aligned = true; - unsigned num_bpp64_colorbufs = 0; + sctx->framebuffer.min_bytes_per_pixel = 0; for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2990,7 +3013,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (surf->color_is_int10) sctx->framebuffer.color_is_int10 |= 1 << i; - if (tex->fmask_offset) + if (tex->surface.fmask_offset) sctx->framebuffer.compressed_cb_mask |= 1 << i; else sctx->framebuffer.uncompressed_cb_mask |= 1 << i; @@ -3009,8 +3032,6 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (tex->surface.is_linear) sctx->framebuffer.any_dst_linear = true; - if (tex->surface.bpe >= 8) - num_bpp64_colorbufs++; if (vi_dcc_enabled(tex, surf->base.u.tex.level)) { sctx->framebuffer.CB_has_shader_readable_metadata = true; @@ -3029,15 +3050,18 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.compressed_cb_mask |= 1 << i; vi_separate_dcc_start_query(sctx, tex); } + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe; } /* For optimal DCC performance. */ - if (sctx->chip_class == GFX8) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; - else if (num_bpp64_colorbufs >= 5) - sctx->framebuffer.dcc_overwrite_combiner_watermark = 8; - else + if (sctx->chip_class >= GFX10) sctx->framebuffer.dcc_overwrite_combiner_watermark = 6; + else + sctx->framebuffer.dcc_overwrite_combiner_watermark = 4; struct si_texture *zstex = NULL; @@ -3054,6 +3078,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.DB_has_shader_readable_metadata = true; si_context_add_resource_size(sctx, surf->base.texture); + + /* Update the minimum but don't keep 0. */ + if (!sctx->framebuffer.min_bytes_per_pixel || + zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel) + sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe; } si_update_ps_colorbuf0_slot(sctx); @@ -3177,8 +3206,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx) if (cb->base.u.tex.level > 0) cb_color_info &= C_028C70_FAST_CLEAR; - if (tex->fmask_offset) { - cb_color_fmask = (tex->buffer.gpu_address + tex->fmask_offset) >> 8; + if (tex->surface.fmask_offset) { + cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8; cb_color_fmask |= tex->surface.fmask_tile_swizzle; } @@ -3193,7 +3222,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_info |= S_028C70_DCC_ENABLE(1); cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + - tex->dcc_offset) >> 8; + tex->surface.dcc_offset) >> 8; unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8; @@ -3206,7 +3235,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) /* Set mutable surface parameters. */ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) + if (!tex->surface.fmask_offset) cb_color_fmask = cb_color_base; if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; @@ -3248,7 +3277,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } else if (sctx->chip_class == GFX9) { struct gfx9_surf_meta_flags meta; - if (tex->dcc_offset) + if (tex->surface.dcc_offset) meta = tex->surface.u.gfx9.dcc; else meta = tex->surface.u.gfx9.cmask; @@ -3256,7 +3285,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) /* Set mutable surface parameters. */ cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) + if (!tex->surface.fmask_offset) cb_color_fmask = cb_color_base; if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; @@ -3296,7 +3325,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) if (level_info->mode == RADEON_SURF_MODE_2D) cb_color_base |= tex->surface.tile_swizzle; - if (!tex->fmask_offset) + if (!tex->surface.fmask_offset) cb_color_fmask = cb_color_base; if (cb->base.u.tex.level > 0) cb_color_cmask = cb_color_base; @@ -3312,7 +3341,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max); cb_color_slice = S_028C68_TILE_MAX(slice_tile_max); - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { if (sctx->chip_class >= GFX7) cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1); cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index); @@ -3453,7 +3482,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx) struct radeon_cmdbuf *cs = sctx->gfx_cs; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned nr_samples = sctx->framebuffer.nr_samples; - bool has_msaa_sample_loc_bug = sctx->screen->has_msaa_sample_loc_bug; + bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug; /* Smoothing (only possible with nr_samples == 1) uses the same * sample locations as the MSAA it simulates. @@ -3516,11 +3545,7 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit; - if (blend) { - colormask &= blend->cb_target_enabled_4bit; - } else { - colormask = 0; - } + colormask &= blend->cb_target_enabled_4bit; /* Conservative: No logic op. */ if (colormask && blend->logicop_enable) @@ -3792,14 +3817,7 @@ si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf, * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units. * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE. */ - if (screen->info.chip_class == GFX9 && HAVE_LLVM < 0x0800) - /* When vindex == 0, LLVM < 8.0 sets IDXEN = 0, thus changing units - * from STRIDE to bytes. This works around it by setting - * NUM_RECORDS to at least the size of one element, so that - * the first element is readable when IDXEN == 0. - */ - num_records = num_records ? MAX2(num_records, stride) : 0; - else if (screen->info.chip_class == GFX8) + if (screen->info.chip_class == GFX8) num_records *= stride; state[4] = 0; @@ -3978,17 +3996,17 @@ gfx10_make_texture_descriptor(struct si_screen *screen, state[6] = 0; state[7] = 0; - if (tex->dcc_offset) { + if (tex->surface.dcc_offset) { state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) | S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); } /* Initialize the sampler view for FMASK. */ - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { uint32_t format; - va = tex->buffer.gpu_address + tex->fmask_offset; + va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) switch (FMASK(res->nr_samples, res->nr_storage_samples)) { @@ -4261,7 +4279,7 @@ si_make_texture_descriptor(struct si_screen *screen, state[5] |= S_008F24_LAST_ARRAY(last_layer); } - if (tex->dcc_offset) { + if (tex->surface.dcc_offset) { state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); } else { /* The last dword is unused by hw. The shader uses it to clear @@ -4276,10 +4294,10 @@ si_make_texture_descriptor(struct si_screen *screen, } /* Initialize the sampler view for FMASK. */ - if (tex->fmask_offset) { + if (tex->surface.fmask_offset) { uint32_t data_format, num_format; - va = tex->buffer.gpu_address + tex->fmask_offset; + va = tex->buffer.gpu_address + tex->surface.fmask_offset; #define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f))) if (screen->info.chip_class == GFX9) { @@ -5388,13 +5406,9 @@ static void si_init_config(struct si_context *sctx) { struct si_screen *sscreen = sctx->screen; uint64_t border_color_va = sctx->border_color_buffer->gpu_address; - bool has_clear_state = sscreen->has_clear_state; + bool has_clear_state = sscreen->info.has_clear_state; struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - /* GFX6, radeon kernel disabled CLEAR_STATE. */ - assert(has_clear_state || sscreen->info.chip_class == GFX6 || - !sscreen->info.is_amdgpu); - if (!pm4) return; @@ -5428,7 +5442,8 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); } - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); + if (sscreen->info.chip_class <= GFX9) + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); if (!has_clear_state) si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); if (sctx->chip_class < GFX7) @@ -5438,7 +5453,7 @@ static void si_init_config(struct si_context *sctx) /* CLEAR_STATE doesn't clear these correctly on certain generations. * I don't know why. Deduced by trial and error. */ - if (sctx->chip_class <= GFX7) { + if (sctx->chip_class <= GFX7 || !has_clear_state) { si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); @@ -5467,9 +5482,12 @@ static void si_init_config(struct si_context *sctx) } if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); } else if (sctx->chip_class == GFX9) { si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); @@ -5489,9 +5507,6 @@ static void si_init_config(struct si_context *sctx) /* Logical CUs 16 - 31 */ si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - S_00B204_CU_EN(0xffff) | - S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0)); si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, @@ -5517,8 +5532,6 @@ static void si_init_config(struct si_context *sctx) S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); } - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F)); /* Compute LATE_ALLOC_VS.LIMIT. */ unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; @@ -5542,13 +5555,44 @@ static void si_init_config(struct si_context *sctx) late_alloc_limit = (num_cu_per_sh - 2) * 4; } + unsigned late_alloc_limit_gs = late_alloc_limit; + unsigned cu_mask_vs = 0xffff; + unsigned cu_mask_gs = 0xffff; + + if (late_alloc_limit > 2) { + if (sctx->chip_class >= GFX10) { + /* CU2 & CU3 disabled because of the dual CU design */ + cu_mask_vs = 0xfff3; + cu_mask_gs = 0xfff3; /* NGG only */ + } else { + cu_mask_vs = 0xfffe; /* 1 CU disabled */ + } + } + + /* Don't use late alloc for NGG on Navi14 due to a hw bug. + * If NGG is never used, enable all CUs. + */ + if (!sscreen->use_ngg || sctx->family == CHIP_NAVI14) { + late_alloc_limit_gs = 0; + cu_mask_gs = 0xffff; + } + /* VS can't execute on one CU if the limit is > 2. */ si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, - S_00B118_CU_EN(late_alloc_limit > 2 ? 0xfffe : 0xffff) | + S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_limit)); + si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); + + if (sctx->chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + S_00B204_CU_EN(0xffff) | + S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_limit_gs)); + } + si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); } @@ -5565,8 +5609,12 @@ static void si_init_config(struct si_context *sctx) */ si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, - sscreen->info.pa_sc_tile_steering_override); + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + } si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | @@ -5586,6 +5634,25 @@ static void si_init_config(struct si_context *sctx) S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_RD) | S_028410_DCC_RD_POLICY(V_028410_CACHE_NOA_RD) | S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); + si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | + S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + + if (sctx->family == CHIP_NAVI10 || + sctx->family == CHIP_NAVI12 || + sctx->family == CHIP_NAVI14) { + /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ + si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); + si_pm4_cmd_end(pm4, false); + } + /* TODO: For culling, replace 128 with 256. */ + si_pm4_set_reg(pm4, R_030980_GE_PC_ALLOC, + S_030980_OVERSUB_EN(1) | + S_030980_NUM_PC_LINES(128 * sscreen->info.max_se - 1)); } if (sctx->chip_class >= GFX8) { @@ -5619,37 +5686,8 @@ static void si_init_config(struct si_context *sctx) RADEON_PRIO_BORDER_COLORS); if (sctx->chip_class >= GFX9) { - unsigned num_se = sscreen->info.max_se; - unsigned pc_lines = 0; - unsigned max_alloc_count = 0; - - switch (sctx->family) { - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - pc_lines = 2048; - break; - case CHIP_RAVEN: - case CHIP_RAVEN2: - case CHIP_NAVI10: - case CHIP_NAVI12: - pc_lines = 1024; - break; - case CHIP_NAVI14: - pc_lines = 512; - break; - default: - assert(0); - } - - if (sctx->chip_class >= GFX10) { - max_alloc_count = pc_lines / 3; - } else { - max_alloc_count = MIN2(128, pc_lines / (4 * num_se)); - } - si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, - S_028C48_MAX_ALLOC_COUNT(max_alloc_count) | + S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | S_028C48_MAX_PRIM_PER_BATCH(1023)); si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));