X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_state.c;h=4c2e0c7a6c102af751ddd61722a621b4fcc9b771;hp=b9fea68ba0275316e959c5b09fcc65e9fa83ce8a;hb=64349a60e17a03de4bb7e03d942bfc1679dfe8ab;hpb=8c9b9aac7d09e65195dca6681d59c10e4ef713d9 diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index b9fea68ba02..4c2e0c7a6c1 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -32,15 +32,7 @@ #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" - -struct gfx10_format { - unsigned img_format : 9; - - /* Various formats are only supported with workarounds for vertex fetch, - * and some 32_32_32 formats are supported natively, but only for buffers - * (possibly with some image support, actually, but no filtering). */ - bool buffers_only : 1; -}; +#include "util/u_blend.h" #include "gfx10_format_table.h" @@ -246,6 +238,11 @@ static void si_emit_cb_render_state(struct si_context *sctx) sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4); } break; + + case V_028C70_COLOR_5_9_9_9: + if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) + sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4); + break; } } @@ -430,13 +427,6 @@ static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned * } } -static bool si_blend_factor_uses_dst(unsigned factor) -{ - return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA || - factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || - factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR; -} - static void *si_create_blend_state_mode(struct pipe_context *ctx, const struct pipe_blend_state *state, unsigned mode) { @@ -555,9 +545,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, dstA_opt = si_translate_blend_opt_factor(dstA, true); /* Handle interdependencies. */ - if (si_blend_factor_uses_dst(srcRGB)) + if (util_blend_factor_uses_dest(srcRGB, false)) dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; - if (si_blend_factor_uses_dst(srcA)) + if (util_blend_factor_uses_dest(srcA, false)) dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE && @@ -742,14 +732,15 @@ static void si_emit_clip_regs(struct si_context *sctx) struct si_shader_selector *vs_sel = vs->selector; struct si_shader_info *info = &vs_sel->info; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + bool window_space = info->stage == MESA_SHADER_VERTEX ? + info->base.vs.window_space_position : 0; unsigned clipdist_mask = vs_sel->clipdist_mask; unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; unsigned culldist_mask = vs_sel->culldist_mask; unsigned total_mask; if (vs->key.opt.clip_disable) { - assert(!info->culldist_writemask); + assert(!info->base.cull_distance_array_size); clipdist_mask = 0; culldist_mask = 0; } @@ -766,8 +757,10 @@ static void si_emit_clip_regs(struct si_context *sctx) unsigned initial_cdw = sctx->gfx_cs->current.cdw; unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask | - (culldist_mask << 8); + S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | + S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3) | + S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) | + clipdist_mask | (culldist_mask << 8); if (sctx->chip_class >= GFX10) { radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, @@ -1369,12 +1362,6 @@ static void si_emit_db_render_state(struct si_context *sctx) if (sctx->chip_class >= GFX7) { unsigned log_sample_rate = sctx->framebuffer.log_samples; - /* Stoney doesn't increment occlusion query counters - * if the sample rate is 16x. Use 8x sample rate instead. - */ - if (sctx->family == CHIP_STONEY) - log_sample_rate = MIN2(log_sample_rate, 3); - db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) | S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) | @@ -1399,8 +1386,9 @@ static void si_emit_db_render_state(struct si_context *sctx) radeon_opt_set_context_reg( sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | - S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | - S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4)); + S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) | + S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) | + S_028010_CENTROID_COMPUTATION_MODE(sctx->chip_class >= GFX10_3 ? 2 : 0)); db_shader_control = sctx->ps_db_shader_control; @@ -1427,7 +1415,8 @@ static void si_emit_db_render_state(struct si_context *sctx) /* * format translation */ -static uint32_t si_translate_colorformat(enum pipe_format format) +static uint32_t si_translate_colorformat(enum chip_class chip_class, + enum pipe_format format) { const struct util_format_description *desc = util_format_description(format); if (!desc) @@ -1440,6 +1429,10 @@ static uint32_t si_translate_colorformat(enum pipe_format format) if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ return V_028C70_COLOR_10_11_11; + if (chip_class >= GFX10_3 && + format == PIPE_FORMAT_R9G9B9E5_FLOAT) /* isn't plain */ + return V_028C70_COLOR_5_9_9_9; + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) return V_028C70_COLOR_INVALID; @@ -1654,7 +1647,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 || - sscreen->info.family == CHIP_RAVEN)) { + sscreen->info.family == CHIP_RAVEN || sscreen->info.family == CHIP_RAVEN2)) { switch (format) { case PIPE_FORMAT_ETC1_RGB8: case PIPE_FORMAT_ETC2_RGB8: @@ -1780,8 +1773,8 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for case 4: switch (desc->nr_channels) { #if 0 /* Not supported for render targets */ - case 2: - return V_008F14_IMG_DATA_FORMAT_4_4; + case 2: + return V_008F14_IMG_DATA_FORMAT_4_4; #endif case 4: return V_008F14_IMG_DATA_FORMAT_4_4_4_4; @@ -1814,8 +1807,8 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for case 2: return V_008F14_IMG_DATA_FORMAT_32_32; #if 0 /* Not supported for render targets */ - case 3: - return V_008F14_IMG_DATA_FORMAT_32_32_32; + case 3: + return V_008F14_IMG_DATA_FORMAT_32_32_32; #endif case 4: return V_008F14_IMG_DATA_FORMAT_32_32_32_32; @@ -2105,9 +2098,10 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum p return usage; } -static bool si_is_colorbuffer_format_supported(enum pipe_format format) +static bool si_is_colorbuffer_format_supported(enum chip_class chip_class, + enum pipe_format format) { - return si_translate_colorformat(format) != V_028C70_COLOR_INVALID && + return si_translate_colorformat(chip_class, format) != V_028C70_COLOR_INVALID && si_translate_colorswap(format, false) != ~0U; } @@ -2140,17 +2134,23 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format !util_is_power_of_two_or_zero(storage_sample_count)) return false; + /* Chips with 1 RB don't increment occlusion queries at 16x MSAA sample rate, + * so don't expose 16 samples there. + */ + const unsigned max_eqaa_samples = sscreen->info.num_render_backends == 1 ? 8 : 16; + const unsigned max_samples = 8; + /* MSAA support without framebuffer attachments. */ - if (format == PIPE_FORMAT_NONE && sample_count <= 16) + if (format == PIPE_FORMAT_NONE && sample_count <= max_eqaa_samples) return true; if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) { /* Color without EQAA or depth/stencil. */ - if (sample_count > 8 || sample_count != storage_sample_count) + if (sample_count > max_samples || sample_count != storage_sample_count) return false; } else { /* Color with EQAA. */ - if (sample_count > 16 || storage_sample_count > 8) + if (sample_count > max_eqaa_samples || storage_sample_count > max_samples) return false; } } @@ -2167,7 +2167,7 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) && - si_is_colorbuffer_format_supported(format)) { + si_is_colorbuffer_format_supported(sscreen->info.chip_class, format)) { retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED); if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format)) @@ -2196,116 +2196,14 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap, unsigned ntype, bool is_depth) { - /* Alpha is needed for alpha-to-coverage. - * Blending may be with or without alpha. - */ - unsigned normal = 0; /* most optimal, may not support blending or export alpha */ - unsigned alpha = 0; /* exports alpha, but may not support blending */ - unsigned blend = 0; /* supports blending, but may not export alpha */ - unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */ - - /* Choose the SPI color formats. These are required values for RB+. - * Other chips have multiple choices, though they are not necessarily better. - */ - switch (format) { - case V_028C70_COLOR_5_6_5: - case V_028C70_COLOR_1_5_5_5: - case V_028C70_COLOR_5_5_5_1: - case V_028C70_COLOR_4_4_4_4: - case V_028C70_COLOR_10_11_11: - case V_028C70_COLOR_11_11_10: - case V_028C70_COLOR_8: - case V_028C70_COLOR_8_8: - case V_028C70_COLOR_8_8_8_8: - case V_028C70_COLOR_10_10_10_2: - case V_028C70_COLOR_2_10_10_10: - if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - break; - - case V_028C70_COLOR_16: - case V_028C70_COLOR_16_16: - case V_028C70_COLOR_16_16_16_16: - if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) { - /* UNORM16 and SNORM16 don't support blending */ - if (ntype == V_028C70_NUMBER_UNORM) - normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR; - else - normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR; - - /* Use 32 bits per channel for blending. */ - if (format == V_028C70_COLOR_16) { - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = V_028714_SPI_SHADER_32_R; - blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else if (format == V_028C70_COLOR_16_16) { - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = V_028714_SPI_SHADER_32_GR; - blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - blend = blend_alpha = V_028714_SPI_SHADER_32_AR; - else - assert(0); - } else /* 16_16_16_16 */ - blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (ntype == V_028C70_NUMBER_UINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR; - else if (ntype == V_028C70_NUMBER_SINT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR; - else if (ntype == V_028C70_NUMBER_FLOAT) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR; - else - assert(0); - break; + struct ac_spi_color_formats formats = {}; - case V_028C70_COLOR_32: - if (swap == V_028C70_SWAP_STD) { /* R */ - blend = normal = V_028714_SPI_SHADER_32_R; - alpha = blend_alpha = V_028714_SPI_SHADER_32_AR; - } else if (swap == V_028C70_SWAP_ALT_REV) /* A */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32: - if (swap == V_028C70_SWAP_STD) { /* RG */ - blend = normal = V_028714_SPI_SHADER_32_GR; - alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR; - } else if (swap == V_028C70_SWAP_ALT) /* RA */ - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR; - else - assert(0); - break; - - case V_028C70_COLOR_32_32_32_32: - case V_028C70_COLOR_8_24: - case V_028C70_COLOR_24_8: - case V_028C70_COLOR_X24_8_32_FLOAT: - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - break; - - default: - assert(0); - return; - } + ac_choose_spi_color_formats(format, swap, ntype, is_depth, &formats); - /* The DB->CB copy needs 32_ABGR. */ - if (is_depth) - alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR; - - surf->spi_shader_col_format = normal; - surf->spi_shader_col_format_alpha = alpha; - surf->spi_shader_col_format_blend = blend; - surf->spi_shader_col_format_blend_alpha = blend_alpha; + surf->spi_shader_col_format = formats.normal; + surf->spi_shader_col_format_alpha = formats.alpha; + surf->spi_shader_col_format_blend = formats.blend; + surf->spi_shader_col_format_blend_alpha = formats.blend_alpha; } static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf) @@ -2346,7 +2244,7 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa } } - format = si_translate_colorformat(surf->base.format); + format = si_translate_colorformat(sctx->chip_class, surf->base.format); if (format == V_028C70_COLOR_INVALID) { PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format); } @@ -2801,6 +2699,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.DB_has_shader_readable_metadata = false; sctx->framebuffer.all_DCC_pipe_aligned = true; sctx->framebuffer.min_bytes_per_pixel = 0; + sctx->framebuffer.color_big_page = true; + sctx->framebuffer.zs_big_page = true; for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) @@ -2820,6 +2720,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha << (i * 4); + sctx->framebuffer.color_big_page &= + tex->buffer.bo_alignment % (64 * 1024) == 0; + if (surf->color_is_int8) sctx->framebuffer.color_is_int8 |= 1 << i; if (surf->color_is_int10) @@ -2830,7 +2733,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, else sctx->framebuffer.uncompressed_cb_mask |= 1 << i; - if (tex->surface.dcc_offset) + if (tex->surface.display_dcc_offset) sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i; /* Don't update nr_color_samples for non-AA buffers. @@ -2885,6 +2788,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_init_depth_surface(sctx, surf); } + sctx->framebuffer.zs_big_page = zstex->buffer.bo_alignment % (64 * 1024) == 0; + if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS)) sctx->framebuffer.DB_has_shader_readable_metadata = true; @@ -2972,6 +2877,17 @@ static void si_emit_framebuffer_state(struct si_context *sctx) struct si_surface *cb = NULL; unsigned cb_color_info = 0; + /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */ + unsigned meta_write_policy, meta_read_policy; + /* TODO: investigate whether LRU improves performance on other chips too */ + if (sctx->screen->info.num_render_backends <= 4) { + meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ + meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ + } else { + meta_write_policy = V_02807C_CACHE_STREAM; /* write combine */ + meta_read_policy = V_02807C_CACHE_NOA; /* don't cache reads */ + } + /* Colorbuffers. */ for (i = 0; i < nr_cbufs; i++) { uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; @@ -3214,6 +3130,9 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } if (sctx->chip_class >= GFX10) { + bool zs_big_page = sctx->chip_class >= GFX10_3 && + sctx->framebuffer.zs_big_page; + radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); @@ -3227,12 +3146,22 @@ static void si_emit_framebuffer_state(struct si_context *sctx) radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); + radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 6); radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(cs, /* DB_RMI_L2_CACHE_CONTROL */ + S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) | + S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) | + S_02807C_HTILE_WR_POLICY(meta_write_policy) | + S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) | + S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA) | + S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA) | + S_02807C_HTILE_RD_POLICY(meta_read_policy) | + S_02807C_Z_BIG_PAGE(zs_big_page) | + S_02807C_S_BIG_PAGE(zs_big_page)); } else if (sctx->chip_class == GFX9) { radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ @@ -3315,10 +3244,26 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } /* Framebuffer dimensions. */ - /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */ + /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */ radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); + if (nr_cbufs) { + bool color_big_page = sctx->chip_class >= GFX10_3 && + sctx->framebuffer.color_big_page; + radeon_set_context_reg(cs, R_028410_CB_RMI_GL2_CACHE_CONTROL, + S_028410_CMASK_WR_POLICY(meta_write_policy) | + S_028410_FMASK_WR_POLICY(meta_write_policy) | + S_028410_DCC_WR_POLICY(meta_write_policy) | + S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM) | + S_028410_CMASK_RD_POLICY(meta_read_policy) | + S_028410_FMASK_RD_POLICY(meta_read_policy) | + S_028410_DCC_RD_POLICY(meta_read_policy) | + S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA) | + S_028410_FMASK_BIG_PAGE(color_big_page) | + S_028410_COLOR_BIG_PAGE(color_big_page)); + } + if (sctx->screen->dfsm_allowed) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); @@ -3408,8 +3353,8 @@ static bool si_out_of_order_rasterization(struct si_context *sctx) /* The set of PS invocations is always order invariant, * except when early Z/S tests are requested. */ - if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory && - sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] && + if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.base.writes_memory && + sctx->ps_shader.cso->info.base.fs.early_fragment_tests && !dsa_order_invariant.pass_set) return false; @@ -3544,7 +3489,8 @@ static void si_emit_msaa_config(struct si_context *sctx) sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1); sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) | S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | - S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples); + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | + S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->chip_class >= GFX10_3); if (sctx->framebuffer.nr_samples > 1) { db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | @@ -3806,11 +3752,13 @@ static void gfx10_make_texture_descriptor( state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) | S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : tex->buffer.b.b.last_level) | - S_00A014_PERF_MOD(4); + S_00A014_PERF_MOD(4) | + S_00A014_BIG_PAGE(screen->info.chip_class >= GFX10_3 && + tex->buffer.bo_alignment % (64 * 1024) == 0); state[6] = 0; state[7] = 0; - if (tex->surface.dcc_offset) { + if (vi_dcc_enabled(tex, first_level)) { state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | S_00A018_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.dcc.max_compressed_block_size) | S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); @@ -4071,7 +4019,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu state[5] |= S_008F24_LAST_ARRAY(last_layer); } - if (tex->surface.dcc_offset) { + if (vi_dcc_enabled(tex, first_level)) { state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format)); } else { /* The last dword is unused by hw. The shader uses it to clear @@ -4096,43 +4044,43 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu data_format = V_008F14_IMG_DATA_FORMAT_FMASK; switch (FMASK(res->nr_samples, res->nr_storage_samples)) { case FMASK(2, 1): - num_format = V_008F14_IMG_FMASK_8_2_1; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_1; break; case FMASK(2, 2): - num_format = V_008F14_IMG_FMASK_8_2_2; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_2; break; case FMASK(4, 1): - num_format = V_008F14_IMG_FMASK_8_4_1; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_1; break; case FMASK(4, 2): - num_format = V_008F14_IMG_FMASK_8_4_2; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_2; break; case FMASK(4, 4): - num_format = V_008F14_IMG_FMASK_8_4_4; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_4; break; case FMASK(8, 1): - num_format = V_008F14_IMG_FMASK_8_8_1; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_8_1; break; case FMASK(8, 2): - num_format = V_008F14_IMG_FMASK_16_8_2; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_16_8_2; break; case FMASK(8, 4): - num_format = V_008F14_IMG_FMASK_32_8_4; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_4; break; case FMASK(8, 8): - num_format = V_008F14_IMG_FMASK_32_8_8; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_8; break; case FMASK(16, 1): - num_format = V_008F14_IMG_FMASK_16_16_1; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_16_16_1; break; case FMASK(16, 2): - num_format = V_008F14_IMG_FMASK_32_16_2; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_16_2; break; case FMASK(16, 4): - num_format = V_008F14_IMG_FMASK_64_16_4; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_64_16_4; break; case FMASK(16, 8): - num_format = V_008F14_IMG_FMASK_64_16_8; + num_format = V_008F14_IMG_NUM_FORMAT_FMASK_64_16_8; break; default: unreachable("invalid nr_samples"); @@ -4478,6 +4426,9 @@ static void *si_create_sampler_state(struct pipe_context *ctx, struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy; unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso); + bool trunc_coord = state->min_img_filter == PIPE_TEX_FILTER_NEAREST && + state->mag_img_filter == PIPE_TEX_FILTER_NEAREST && + state->compare_mode == PIPE_TEX_COMPARE_NONE; union pipe_color_union clamped_border_color; if (!rstate) { @@ -4494,6 +4445,7 @@ static void *si_create_sampler_state(struct pipe_context *ctx, S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) | S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | + S_008F30_TRUNC_COORD(trunc_coord) | S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9)); rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) | @@ -4510,7 +4462,7 @@ static void *si_create_sampler_state(struct pipe_context *ctx, } else { rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) | S_008F38_FILTER_PREC_FIX(1) | - S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8); + S_008F38_ANISO_OVERRIDE_GFX8(sctx->chip_class >= GFX8); } /* Create sampler resource for integer textures. */ @@ -4554,7 +4506,7 @@ static void si_emit_sample_mask(struct si_context *sctx) unsigned mask = sctx->sample_mask; /* Needed for line and polygon smoothing as well as for the Polaris - * small primitive filter. We expect the gallium frontend to take care of + * small primitive filter. We expect the gallium frontend to take care of * this for us. */ assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || @@ -4750,8 +4702,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, * into account would complicate the fast path (where everything * is nicely aligned). */ - bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 || - sscreen->info.chip_class == GFX10); + bool check_alignment = + log_hw_load_size >= 1 && + (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class >= GFX10); bool opencode = sscreen->options.vs_fetch_always_opencode; if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0) @@ -5009,8 +4962,6 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) return si_create_blend_state_mode(&sctx->b, &blend, mode); } -static void si_init_config(struct si_context *sctx); - void si_init_state_compute_functions(struct si_context *sctx) { sctx->b.create_sampler_state = si_create_sampler_state; @@ -5071,8 +5022,6 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.set_tess_state = si_set_tess_state; sctx->b.set_active_query_state = si_set_active_query_state; - - si_init_config(sctx); } void si_init_screen_state_functions(struct si_screen *sscreen) @@ -5141,7 +5090,7 @@ static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *p } } -static void si_init_config(struct si_context *sctx) +void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) { struct si_screen *sscreen = sctx->screen; uint64_t border_color_va = sctx->border_color_buffer->gpu_address; @@ -5151,59 +5100,25 @@ static void si_init_config(struct si_context *sctx) if (!pm4) return; - si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1)); - si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1)); - si_pm4_cmd_end(pm4, false); - - if (has_clear_state) { - si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE); - si_pm4_cmd_add(pm4, 0); - si_pm4_cmd_end(pm4, false); - } - - if (sctx->chip_class <= GFX8) - si_set_raster_config(sctx, pm4); - - si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); - - /* FIXME calculate these values somehow ??? */ - if (sctx->chip_class <= GFX8) { - si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); - si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); - } + if (!uses_reg_shadowing) { + si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1)); + si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1)); - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); - si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); - si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + if (has_clear_state) { + si_pm4_cmd_add(pm4, PKT3(PKT3_CLEAR_STATE, 0, 0)); + si_pm4_cmd_add(pm4, 0); + } } - if (sscreen->info.chip_class <= GFX9) - si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); - if (!has_clear_state) - si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); - if (sctx->chip_class < GFX7) - si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, - S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); - /* CLEAR_STATE doesn't restore these correctly. */ si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1)); si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR, S_028244_BR_X(16384) | S_028244_BR_Y(16384)); - /* CLEAR_STATE doesn't clear these correctly on certain generations. - * I don't know why. Deduced by trial and error. - */ - if (sctx->chip_class <= GFX7 || !has_clear_state) { - si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); - si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); - si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); - si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, - S_028034_BR_X(16384) | S_028034_BR_Y(16384)); - } + si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64)); + if (!has_clear_state) + si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0)); if (!has_clear_state) { si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE, @@ -5216,58 +5131,39 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0); + si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); + si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); + si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); + si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); } - if (sctx->chip_class >= GFX10) { - si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); - si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); - si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); - si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); - } else if (sctx->chip_class == GFX9) { - si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); - } else { - /* These registers, when written, also overwrite the CLEAR_STATE - * context, so we can't rely on CLEAR_STATE setting them. - * It would be an issue if there was another UMD changing them. - */ - si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); - si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); - si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); + if (sctx->chip_class >= GFX7) + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); + + if (sctx->chip_class == GFX6) { + si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, + S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1)); } - if (sctx->chip_class >= GFX7) { - if (sctx->chip_class >= GFX10) { - /* Logical CUs 16 - 31 */ - si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); - si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff)); - } + if (sctx->chip_class <= GFX7 || !has_clear_state) { + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, - S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); - } else { - si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, - S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, - S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); - - /* If this is 0, Bonaire can hang even if GS isn't being used. - * Other chips are unaffected. These are suboptimal values, - * but we don't use on-chip GS. - */ - si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, - S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); - } + /* CLEAR_STATE doesn't clear these correctly on certain generations. + * I don't know why. Deduced by trial and error. + */ + si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); + si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1)); + si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0); + si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR, + S_028034_BR_X(16384) | S_028034_BR_Y(16384)); + } + if (sctx->chip_class >= GFX7) { /* Compute LATE_ALLOC_VS.LIMIT. */ - unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh; - unsigned late_alloc_wave64 = 0; /* The limit is per SH. */ + unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa; + unsigned late_alloc_wave64 = 0; /* The limit is per SA. */ unsigned cu_mask_vs = 0xffff; unsigned cu_mask_gs = 0xffff; @@ -5291,7 +5187,7 @@ static void si_init_config(struct si_context *sctx) if (!sscreen->info.use_late_alloc) { late_alloc_wave64 = 0; } else if (num_cu_per_sh <= 4) { - /* Too few available compute units per SH. Disallowing + /* Too few available compute units per SA. Disallowing * VS to run on one CU could hurt us more than late VS * allocation would help. * @@ -5314,72 +5210,44 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F)); si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); - si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F)); } - if (sctx->chip_class >= GFX10) { - /* Break up a pixel wave if it contains deallocs for more than - * half the parameter cache. - * - * To avoid a deadlock where pixel waves aren't launched - * because they're waiting for more pixels while the frontend - * is stuck waiting for PC space, the maximum allowed value is - * the size of the PC minus the largest possible allocation for - * a single primitive shader subgroup. - */ - si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + if (sctx->chip_class <= GFX8) { + si_set_raster_config(sctx, pm4); - if (!has_clear_state) { - si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, - sscreen->info.pa_sc_tile_steering_override); - } + /* FIXME calculate these values somehow ??? */ + si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); + si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); - /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */ - unsigned meta_write_policy, meta_read_policy; - /* TODO: investigate whether LRU improves performance on other chips too */ - if (sscreen->info.num_render_backends <= 4) { - meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */ - meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */ - } else { - meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */ - meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */ - } + /* These registers, when written, also overwrite the CLEAR_STATE + * context, so we can't rely on CLEAR_STATE setting them. + * It would be an issue if there was another UMD changing them. + */ + si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); + } - si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL, - S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_HTILE_WR_POLICY(meta_write_policy) | - S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) | - S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) | - S_02807C_HTILE_RD_POLICY(meta_read_policy)); - - si_pm4_set_reg( - pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL, - S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) | - S_028410_DCC_WR_POLICY(meta_write_policy) | - S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) | - S_028410_CMASK_RD_POLICY(meta_read_policy) | - S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) | - S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD)); - si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, + S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, + S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F)); - si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, - S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); - si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + /* If this is 0, Bonaire can hang even if GS isn't being used. + * Other chips are unaffected. These are suboptimal values, + * but we don't use on-chip GS. + */ + si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4)); } - if (sctx->chip_class >= GFX9) { - si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, - S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) | - S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6)); - } else if (sctx->chip_class >= GFX8) { + if (sctx->chip_class == GFX8) { unsigned vgt_tess_distribution; vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) | @@ -5392,26 +5260,97 @@ static void si_init_config(struct si_context *sctx) vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3); si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution); - } else if (!has_clear_state) { - si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); - si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16); } - si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); - if (sctx->chip_class >= GFX7) { - si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40)); + if (sscreen->info.chip_class <= GFX9) { + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); + } + + if (sctx->chip_class == GFX9) { + si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0); } - si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS); if (sctx->chip_class >= GFX9) { + si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, + S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F)); + + si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, + S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) | + S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6)); si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1, S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) | - S_028C48_MAX_PRIM_PER_BATCH(1023)); + S_028C48_MAX_PRIM_PER_BATCH(1023)); si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1)); + si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0); + si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, + sctx->chip_class >= GFX10 ? 0x20 : 0); + } + + if (sctx->chip_class >= GFX10) { + /* Logical CUs 16 - 31 */ + si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff)); + si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff)); + + si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0); + si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0); + si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0); + si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0); + si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0); + si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0); + si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0); + si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0); + si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0); + si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0); + si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0); + si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0); + si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0); + si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0); + si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0); + si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0); + + si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, + S_00B0C0_SOFT_GROUPING_EN(1) | + S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); + si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0); + + si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0); + si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0); + + /* Break up a pixel wave if it contains deallocs for more than + * half the parameter cache. + * + * To avoid a deadlock where pixel waves aren't launched + * because they're waiting for more pixels while the frontend + * is stuck waiting for PC space, the maximum allowed value is + * the size of the PC minus the largest possible allocation for + * a single primitive shader subgroup. + */ + si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); + /* Reuse for legacy (non-NGG) only. */ + si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + + if (!has_clear_state) { + si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE, + sscreen->info.pa_sc_tile_steering_override); + } + + + si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0); + si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0); + si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0); + si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0); + si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0); + } + + if (sctx->chip_class >= GFX10_3) { + si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff); + si_pm4_set_reg(pm4, 0x28848, 1 << 9); /* This fixes sample shading. */ } - si_pm4_upload_indirect_buffer(sctx, pm4); - sctx->init_config = pm4; + sctx->cs_preamble_state = pm4; }