radeonsi: remove redundant si_shader_info::writes_memory
[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
index dbafa173e02f06177ff708e5d709b1a94f0d929f..4c2e0c7a6c102af751ddd61722a621b4fcc9b771 100644 (file)
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
-
-struct gfx10_format {
-   unsigned img_format : 9;
-
-   /* Various formats are only supported with workarounds for vertex fetch,
-    * and some 32_32_32 formats are supported natively, but only for buffers
-    * (possibly with some image support, actually, but no filtering). */
-   bool buffers_only : 1;
-};
+#include "util/u_blend.h"
 
 #include "gfx10_format_table.h"
 
@@ -246,6 +238,11 @@ static void si_emit_cb_render_state(struct si_context *sctx)
                sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
             }
             break;
+
+         case V_028C70_COLOR_5_9_9_9:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
+            break;
          }
       }
 
@@ -430,13 +427,6 @@ static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *
    }
 }
 
-static bool si_blend_factor_uses_dst(unsigned factor)
-{
-   return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA ||
-          factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
-          factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
-}
-
 static void *si_create_blend_state_mode(struct pipe_context *ctx,
                                         const struct pipe_blend_state *state, unsigned mode)
 {
@@ -455,6 +445,10 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
    blend->dual_src_blend = util_blend_state_is_dual(state, 0);
    blend->logicop_enable = logicop_enable;
 
+   unsigned num_shader_outputs = state->max_rt + 1; /* estimate */
+   if (blend->dual_src_blend)
+      num_shader_outputs = MAX2(num_shader_outputs, 2);
+
    if (logicop_enable) {
       color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
    } else {
@@ -481,7 +475,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
    blend->cb_target_mask = 0;
    blend->cb_target_enabled_4bit = 0;
 
-   for (int i = 0; i < 8; i++) {
+   for (int i = 0; i < num_shader_outputs; i++) {
       /* state->rt entries > 0 only written if independent blending */
       const int j = state->independent_blend_enable ? i : 0;
 
@@ -551,9 +545,9 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
       dstA_opt = si_translate_blend_opt_factor(dstA, true);
 
       /* Handle interdependencies. */
-      if (si_blend_factor_uses_dst(srcRGB))
+      if (util_blend_factor_uses_dest(srcRGB, false))
          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
-      if (si_blend_factor_uses_dst(srcA))
+      if (util_blend_factor_uses_dest(srcA, false))
          dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
 
       if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
@@ -584,7 +578,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 
       blend->blend_enable_4bit |= 0xfu << (i * 4);
 
-      if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14)
+      if (sctx->chip_class >= GFX8 && sctx->chip_class <= GFX10)
          blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);
 
       /* This is only important for formats without alpha. */
@@ -595,7 +589,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
          blend->need_src_alpha_4bit |= 0xfu << (i * 4);
    }
 
-   if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable)
+   if (sctx->chip_class >= GFX8 && sctx->chip_class <= GFX10 && logicop_enable)
       blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;
 
    if (blend->cb_target_mask) {
@@ -609,13 +603,13 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
        * Vulkan does this.
        */
       if (blend->dual_src_blend) {
-         for (int i = 0; i < 8; i++) {
+         for (int i = 0; i < num_shader_outputs; i++) {
             sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
                                   S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
          }
       }
 
-      for (int i = 0; i < 8; i++)
+      for (int i = 0; i < num_shader_outputs; i++)
          si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]);
 
       /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
@@ -738,14 +732,15 @@ static void si_emit_clip_regs(struct si_context *sctx)
    struct si_shader_selector *vs_sel = vs->selector;
    struct si_shader_info *info = &vs_sel->info;
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-   unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+   bool window_space = info->stage == MESA_SHADER_VERTEX ?
+                          info->base.vs.window_space_position : 0;
    unsigned clipdist_mask = vs_sel->clipdist_mask;
    unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
    unsigned culldist_mask = vs_sel->culldist_mask;
    unsigned total_mask;
 
    if (vs->key.opt.clip_disable) {
-      assert(!info->culldist_writemask);
+      assert(!info->base.cull_distance_array_size);
       clipdist_mask = 0;
       culldist_mask = 0;
    }
@@ -762,8 +757,10 @@ static void si_emit_clip_regs(struct si_context *sctx)
 
    unsigned initial_cdw = sctx->gfx_cs->current.cdw;
    unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
-                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask |
-                         (culldist_mask << 8);
+                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
+                         S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
+                         S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
+                         clipdist_mask | (culldist_mask << 8);
 
    if (sctx->chip_class >= GFX10) {
       radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
@@ -1365,12 +1362,6 @@ static void si_emit_db_render_state(struct si_context *sctx)
       if (sctx->chip_class >= GFX7) {
          unsigned log_sample_rate = sctx->framebuffer.log_samples;
 
-         /* Stoney doesn't increment occlusion query counters
-          * if the sample rate is 16x. Use 8x sample rate instead.
-          */
-         if (sctx->family == CHIP_STONEY)
-            log_sample_rate = MIN2(log_sample_rate, 3);
-
          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
                             S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
                             S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) |
@@ -1395,8 +1386,9 @@ static void si_emit_db_render_state(struct si_context *sctx)
    radeon_opt_set_context_reg(
       sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2,
       S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
-         S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
-         S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
+      S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
+      S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) |
+      S_028010_CENTROID_COMPUTATION_MODE(sctx->chip_class >= GFX10_3 ? 2 : 0));
 
    db_shader_control = sctx->ps_db_shader_control;
 
@@ -1423,7 +1415,8 @@ static void si_emit_db_render_state(struct si_context *sctx)
 /*
  * format translation
  */
-static uint32_t si_translate_colorformat(enum pipe_format format)
+static uint32_t si_translate_colorformat(enum chip_class chip_class,
+                                         enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
    if (!desc)
@@ -1436,6 +1429,10 @@ static uint32_t si_translate_colorformat(enum pipe_format format)
    if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
       return V_028C70_COLOR_10_11_11;
 
+   if (chip_class >= GFX10_3 &&
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT) /* isn't plain */
+      return V_028C70_COLOR_5_9_9_9;
+
    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
       return V_028C70_COLOR_INVALID;
 
@@ -1650,7 +1647,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for
 
    if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
        (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 ||
-        sscreen->info.family == CHIP_RAVEN)) {
+        sscreen->info.family == CHIP_RAVEN || sscreen->info.family == CHIP_RAVEN2)) {
       switch (format) {
       case PIPE_FORMAT_ETC1_RGB8:
       case PIPE_FORMAT_ETC2_RGB8:
@@ -1776,8 +1773,8 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for
    case 4:
       switch (desc->nr_channels) {
 #if 0 /* Not supported for render targets */
-               case 2:
-                       return V_008F14_IMG_DATA_FORMAT_4_4;
+      case 2:
+         return V_008F14_IMG_DATA_FORMAT_4_4;
 #endif
       case 4:
          return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
@@ -1810,8 +1807,8 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_for
       case 2:
          return V_008F14_IMG_DATA_FORMAT_32_32;
 #if 0 /* Not supported for render targets */
-               case 3:
-                       return V_008F14_IMG_DATA_FORMAT_32_32_32;
+      case 3:
+         return V_008F14_IMG_DATA_FORMAT_32_32_32;
 #endif
       case 4:
          return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
@@ -2101,9 +2098,10 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum p
    return usage;
 }
 
-static bool si_is_colorbuffer_format_supported(enum pipe_format format)
+static bool si_is_colorbuffer_format_supported(enum chip_class chip_class,
+                                               enum pipe_format format)
 {
-   return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
+   return si_translate_colorformat(chip_class, format) != V_028C70_COLOR_INVALID &&
           si_translate_colorswap(format, false) != ~0U;
 }
 
@@ -2136,17 +2134,23 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format
           !util_is_power_of_two_or_zero(storage_sample_count))
          return false;
 
+      /* Chips with 1 RB don't increment occlusion queries at 16x MSAA sample rate,
+       * so don't expose 16 samples there.
+       */
+      const unsigned max_eqaa_samples = sscreen->info.num_render_backends == 1 ? 8 : 16;
+      const unsigned max_samples = 8;
+
       /* MSAA support without framebuffer attachments. */
-      if (format == PIPE_FORMAT_NONE && sample_count <= 16)
+      if (format == PIPE_FORMAT_NONE && sample_count <= max_eqaa_samples)
          return true;
 
       if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) {
          /* Color without EQAA or depth/stencil. */
-         if (sample_count > 8 || sample_count != storage_sample_count)
+         if (sample_count > max_samples || sample_count != storage_sample_count)
             return false;
       } else {
          /* Color with EQAA. */
-         if (sample_count > 16 || storage_sample_count > 8)
+         if (sample_count > max_eqaa_samples || storage_sample_count > max_samples)
             return false;
       }
    }
@@ -2163,7 +2167,7 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format
 
    if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
                  PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) &&
-       si_is_colorbuffer_format_supported(format)) {
+       si_is_colorbuffer_format_supported(sscreen->info.chip_class, format)) {
       retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
                          PIPE_BIND_SHARED);
       if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format))
@@ -2192,116 +2196,14 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format
 static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap,
                                         unsigned ntype, bool is_depth)
 {
-   /* Alpha is needed for alpha-to-coverage.
-    * Blending may be with or without alpha.
-    */
-   unsigned normal = 0;      /* most optimal, may not support blending or export alpha */
-   unsigned alpha = 0;       /* exports alpha, but may not support blending */
-   unsigned blend = 0;       /* supports blending, but may not export alpha */
-   unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
-
-   /* Choose the SPI color formats. These are required values for RB+.
-    * Other chips have multiple choices, though they are not necessarily better.
-    */
-   switch (format) {
-   case V_028C70_COLOR_5_6_5:
-   case V_028C70_COLOR_1_5_5_5:
-   case V_028C70_COLOR_5_5_5_1:
-   case V_028C70_COLOR_4_4_4_4:
-   case V_028C70_COLOR_10_11_11:
-   case V_028C70_COLOR_11_11_10:
-   case V_028C70_COLOR_8:
-   case V_028C70_COLOR_8_8:
-   case V_028C70_COLOR_8_8_8_8:
-   case V_028C70_COLOR_10_10_10_2:
-   case V_028C70_COLOR_2_10_10_10:
-      if (ntype == V_028C70_NUMBER_UINT)
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
-      else if (ntype == V_028C70_NUMBER_SINT)
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
-      else
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
-      break;
-
-   case V_028C70_COLOR_16:
-   case V_028C70_COLOR_16_16:
-   case V_028C70_COLOR_16_16_16_16:
-      if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
-         /* UNORM16 and SNORM16 don't support blending */
-         if (ntype == V_028C70_NUMBER_UNORM)
-            normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
-         else
-            normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
-
-         /* Use 32 bits per channel for blending. */
-         if (format == V_028C70_COLOR_16) {
-            if (swap == V_028C70_SWAP_STD) { /* R */
-               blend = V_028714_SPI_SHADER_32_R;
-               blend_alpha = V_028714_SPI_SHADER_32_AR;
-            } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
-               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
-            else
-               assert(0);
-         } else if (format == V_028C70_COLOR_16_16) {
-            if (swap == V_028C70_SWAP_STD) { /* RG */
-               blend = V_028714_SPI_SHADER_32_GR;
-               blend_alpha = V_028714_SPI_SHADER_32_ABGR;
-            } else if (swap == V_028C70_SWAP_ALT) /* RA */
-               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
-            else
-               assert(0);
-         } else /* 16_16_16_16 */
-            blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
-      } else if (ntype == V_028C70_NUMBER_UINT)
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
-      else if (ntype == V_028C70_NUMBER_SINT)
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
-      else if (ntype == V_028C70_NUMBER_FLOAT)
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
-      else
-         assert(0);
-      break;
+   struct ac_spi_color_formats formats = {};
 
-   case V_028C70_COLOR_32:
-      if (swap == V_028C70_SWAP_STD) { /* R */
-         blend = normal = V_028714_SPI_SHADER_32_R;
-         alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
-      } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
-      else
-         assert(0);
-      break;
+   ac_choose_spi_color_formats(format, swap, ntype, is_depth, &formats);
 
-   case V_028C70_COLOR_32_32:
-      if (swap == V_028C70_SWAP_STD) { /* RG */
-         blend = normal = V_028714_SPI_SHADER_32_GR;
-         alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
-      } else if (swap == V_028C70_SWAP_ALT) /* RA */
-         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
-      else
-         assert(0);
-      break;
-
-   case V_028C70_COLOR_32_32_32_32:
-   case V_028C70_COLOR_8_24:
-   case V_028C70_COLOR_24_8:
-   case V_028C70_COLOR_X24_8_32_FLOAT:
-      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
-      break;
-
-   default:
-      assert(0);
-      return;
-   }
-
-   /* The DB->CB copy needs 32_ABGR. */
-   if (is_depth)
-      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
-
-   surf->spi_shader_col_format = normal;
-   surf->spi_shader_col_format_alpha = alpha;
-   surf->spi_shader_col_format_blend = blend;
-   surf->spi_shader_col_format_blend_alpha = blend_alpha;
+   surf->spi_shader_col_format = formats.normal;
+   surf->spi_shader_col_format_alpha = formats.alpha;
+   surf->spi_shader_col_format_blend = formats.blend;
+   surf->spi_shader_col_format_blend_alpha = formats.blend_alpha;
 }
 
 static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf)
@@ -2342,7 +2244,7 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa
       }
    }
 
-   format = si_translate_colorformat(surf->base.format);
+   format = si_translate_colorformat(sctx->chip_class, surf->base.format);
    if (format == V_028C70_COLOR_INVALID) {
       PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
    }
@@ -2412,10 +2314,10 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa
          min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
 
       surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
-                             S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
+                             S_028C78_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.dcc.max_compressed_block_size) |
                              S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
-                             S_028C78_INDEPENDENT_64B_BLOCKS(0) |
-                             S_028C78_INDEPENDENT_128B_BLOCKS(1);
+                             S_028C78_INDEPENDENT_64B_BLOCKS(tex->surface.u.gfx9.dcc.independent_64B_blocks) |
+                             S_028C78_INDEPENDENT_128B_BLOCKS(tex->surface.u.gfx9.dcc.independent_128B_blocks);
    } else if (sctx->chip_class >= GFX8) {
       unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
       unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
@@ -2524,23 +2426,6 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su
       if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
          z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1);
 
-         if (tex->tc_compatible_htile) {
-            unsigned max_zplanes = 4;
-
-            if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1)
-               max_zplanes = 2;
-
-            z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
-
-            if (sctx->chip_class >= GFX10) {
-               z_info |= S_028040_ITERATE_FLUSH(1);
-               s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
-            } else {
-               z_info |= S_028038_ITERATE_FLUSH(1);
-               s_info |= S_02803C_ITERATE_FLUSH(1);
-            }
-         }
-
          if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
             /* Stencil buffer workaround ported from the GFX6-GFX8 code.
              * See that for explanation.
@@ -2553,9 +2438,9 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su
 
          surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
          surf->db_htile_surface =
-            S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned);
+            S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1);
          if (sctx->chip_class == GFX9) {
-            surf->db_htile_surface |= S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
+            surf->db_htile_surface |= S_028ABC_RB_ALIGNED(1);
          }
       }
    } else {
@@ -2572,7 +2457,7 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su
       z_info =
          S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
       s_info = S_028044_FORMAT(stencil_format);
-      surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
+      surf->db_depth_info = 0;
 
       if (sctx->chip_class >= GFX7) {
          struct radeon_info *info = &sctx->screen->info;
@@ -2620,28 +2505,10 @@ static void si_init_depth_surface(struct si_context *sctx, struct si_surface *su
              */
             if (tex->buffer.b.b.nr_samples <= 1)
                s_info |= S_028044_ALLOW_EXPCLEAR(1);
-         } else if (!tex->tc_compatible_htile) {
-            /* Use all of the htile_buffer for depth if there's no stencil.
-             * This must not be set when TC-compatible HTILE is enabled
-             * due to a hw bug.
-             */
-            s_info |= S_028044_TILE_STENCIL_DISABLE(1);
          }
 
          surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
          surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
-
-         if (tex->tc_compatible_htile) {
-            surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
-
-            /* 0 = full compression. N = only compress up to N-1 Z planes. */
-            if (tex->buffer.b.b.nr_samples <= 1)
-               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
-            else if (tex->buffer.b.b.nr_samples <= 4)
-               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
-            else
-               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
-         }
       }
    }
 
@@ -2832,6 +2699,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
    sctx->framebuffer.DB_has_shader_readable_metadata = false;
    sctx->framebuffer.all_DCC_pipe_aligned = true;
    sctx->framebuffer.min_bytes_per_pixel = 0;
+   sctx->framebuffer.color_big_page = true;
+   sctx->framebuffer.zs_big_page = true;
 
    for (i = 0; i < state->nr_cbufs; i++) {
       if (!state->cbufs[i])
@@ -2851,6 +2720,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
       sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha
                                                              << (i * 4);
 
+      sctx->framebuffer.color_big_page &=
+            tex->buffer.bo_alignment % (64 * 1024) == 0;
+
       if (surf->color_is_int8)
          sctx->framebuffer.color_is_int8 |= 1 << i;
       if (surf->color_is_int10)
@@ -2861,7 +2733,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
       else
          sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
 
-      if (tex->surface.dcc_offset)
+      if (tex->surface.display_dcc_offset)
          sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i;
 
       /* Don't update nr_color_samples for non-AA buffers.
@@ -2916,6 +2788,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
          si_init_depth_surface(sctx, surf);
       }
 
+      sctx->framebuffer.zs_big_page = zstex->buffer.bo_alignment % (64 * 1024) == 0;
+
       if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS))
          sctx->framebuffer.DB_has_shader_readable_metadata = true;
 
@@ -3003,6 +2877,17 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
    struct si_surface *cb = NULL;
    unsigned cb_color_info = 0;
 
+   /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
+   unsigned meta_write_policy, meta_read_policy;
+   /* TODO: investigate whether LRU improves performance on other chips too */
+   if (sctx->screen->info.num_render_backends <= 4) {
+      meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
+      meta_read_policy =  V_02807C_CACHE_LRU_RD; /* cache reads */
+   } else {
+      meta_write_policy = V_02807C_CACHE_STREAM;    /* write combine */
+      meta_read_policy =  V_02807C_CACHE_NOA;       /* don't cache reads */
+   }
+
    /* Colorbuffers. */
    for (i = 0; i < nr_cbufs; i++) {
       uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
@@ -3080,7 +2965,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
          cb_color_attrib3 = cb->cb_color_attrib3 |
                             S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
                             S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
-                            S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
+                            S_028EE0_CMASK_PIPE_ALIGNED(1) |
                             S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);
 
          radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
@@ -3108,12 +2993,13 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
          radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
          radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
       } else if (sctx->chip_class == GFX9) {
-         struct gfx9_surf_meta_flags meta;
+         struct gfx9_surf_meta_flags meta = {
+            .rb_aligned = 1,
+            .pipe_aligned = 1,
+         };
 
          if (tex->surface.dcc_offset)
             meta = tex->surface.u.gfx9.dcc;
-         else
-            meta = tex->surface.u.gfx9.cmask;
 
          /* Set mutable surface parameters. */
          cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
@@ -3216,31 +3102,66 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
    if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
       struct si_surface *zb = (struct si_surface *)state->zsbuf;
       struct si_texture *tex = (struct si_texture *)zb->base.texture;
+      unsigned db_z_info = zb->db_z_info;
+      unsigned db_stencil_info = zb->db_stencil_info;
+      unsigned db_htile_surface = zb->db_htile_surface;
 
       radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
                                 zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA
                                                                  : RADEON_PRIO_DEPTH_BUFFER);
 
+      /* Set fields dependent on tc_compatile_htile. */
+      if (sctx->chip_class >= GFX9 &&
+          vi_tc_compat_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS)) {
+         unsigned max_zplanes = 4;
+
+         if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1)
+            max_zplanes = 2;
+
+         db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
+
+         if (sctx->chip_class >= GFX10) {
+            db_z_info |= S_028040_ITERATE_FLUSH(1);
+            db_stencil_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
+         } else {
+            db_z_info |= S_028038_ITERATE_FLUSH(1);
+            db_stencil_info |= S_02803C_ITERATE_FLUSH(1);
+         }
+      }
+
       if (sctx->chip_class >= GFX10) {
+         bool zs_big_page = sctx->chip_class >= GFX10_3 &&
+                            sctx->framebuffer.zs_big_page;
+
          radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
          radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
 
          radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
          radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
-         radeon_emit(cs, zb->db_z_info |              /* DB_Z_INFO */
+         radeon_emit(cs, db_z_info |                  /* DB_Z_INFO */
                             S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
-         radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+         radeon_emit(cs, db_stencil_info);     /* DB_STENCIL_INFO */
          radeon_emit(cs, zb->db_depth_base);   /* DB_Z_READ_BASE */
          radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
          radeon_emit(cs, zb->db_depth_base);   /* DB_Z_WRITE_BASE */
          radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
 
-         radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
+         radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 6);
          radeon_emit(cs, zb->db_depth_base >> 32);      /* DB_Z_READ_BASE_HI */
          radeon_emit(cs, zb->db_stencil_base >> 32);    /* DB_STENCIL_READ_BASE_HI */
          radeon_emit(cs, zb->db_depth_base >> 32);      /* DB_Z_WRITE_BASE_HI */
          radeon_emit(cs, zb->db_stencil_base >> 32);    /* DB_STENCIL_WRITE_BASE_HI */
          radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
+         radeon_emit(cs, /* DB_RMI_L2_CACHE_CONTROL */
+                     S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) |
+                     S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) |
+                     S_02807C_HTILE_WR_POLICY(meta_write_policy) |
+                     S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) |
+                     S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA) |
+                     S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA) |
+                     S_02807C_HTILE_RD_POLICY(meta_read_policy) |
+                     S_02807C_Z_BIG_PAGE(zs_big_page) |
+                     S_02807C_S_BIG_PAGE(zs_big_page));
       } else if (sctx->chip_class == GFX9) {
          radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
          radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
@@ -3249,9 +3170,9 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
          radeon_emit(cs, zb->db_depth_size);                          /* DB_DEPTH_SIZE */
 
          radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
-         radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
+         radeon_emit(cs, db_z_info |                                   /* DB_Z_INFO */
                             S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
-         radeon_emit(cs, zb->db_stencil_info);                         /* DB_STENCIL_INFO */
+         radeon_emit(cs, db_stencil_info);                             /* DB_STENCIL_INFO */
          radeon_emit(cs, zb->db_depth_base);                           /* DB_Z_READ_BASE */
          radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32));   /* DB_Z_READ_BASE_HI */
          radeon_emit(cs, zb->db_stencil_base);                         /* DB_STENCIL_READ_BASE */
@@ -3266,13 +3187,38 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
          radeon_emit(cs, zb->db_z_info2);       /* DB_Z_INFO2 */
          radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
       } else {
+         /* GFX6-GFX8 */
+         /* Set fields dependent on tc_compatile_htile. */
+         if (si_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS)) {
+            if (!tex->surface.has_stencil && !tex->tc_compatible_htile) {
+               /* Use all of the htile_buffer for depth if there's no stencil.
+                * This must not be set when TC-compatible HTILE is enabled
+                * due to a hw bug.
+                */
+               db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
+            }
+
+            if (tex->tc_compatible_htile) {
+               db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+
+               /* 0 = full compression. N = only compress up to N-1 Z planes. */
+               if (tex->buffer.b.b.nr_samples <= 1)
+                  db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
+               else if (tex->buffer.b.b.nr_samples <= 4)
+                  db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
+               else
+                  db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
+            }
+         }
+
          radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
 
          radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
-         radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */
-         radeon_emit(cs, zb->db_z_info |     /* DB_Z_INFO */
+         radeon_emit(cs, zb->db_depth_info |   /* DB_DEPTH_INFO */
+                     S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile));
+         radeon_emit(cs, db_z_info |           /* DB_Z_INFO */
                             S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
-         radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
+         radeon_emit(cs, db_stencil_info);     /* DB_STENCIL_INFO */
          radeon_emit(cs, zb->db_depth_base);   /* DB_Z_READ_BASE */
          radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
          radeon_emit(cs, zb->db_depth_base);   /* DB_Z_WRITE_BASE */
@@ -3286,7 +3232,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
       radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
 
       radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
-      radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
+      radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
    } else if (sctx->framebuffer.dirty_zsbuf) {
       if (sctx->chip_class == GFX9)
          radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
@@ -3298,10 +3244,26 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
    }
 
    /* Framebuffer dimensions. */
-   /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */
+   /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */
    radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
                           S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
 
+   if (nr_cbufs) {
+      bool color_big_page = sctx->chip_class >= GFX10_3 &&
+                            sctx->framebuffer.color_big_page;
+      radeon_set_context_reg(cs, R_028410_CB_RMI_GL2_CACHE_CONTROL,
+                             S_028410_CMASK_WR_POLICY(meta_write_policy) |
+                             S_028410_FMASK_WR_POLICY(meta_write_policy) |
+                             S_028410_DCC_WR_POLICY(meta_write_policy) |
+                             S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM) |
+                             S_028410_CMASK_RD_POLICY(meta_read_policy) |
+                             S_028410_FMASK_RD_POLICY(meta_read_policy) |
+                             S_028410_DCC_RD_POLICY(meta_read_policy) |
+                             S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA) |
+                             S_028410_FMASK_BIG_PAGE(color_big_page) |
+                             S_028410_COLOR_BIG_PAGE(color_big_page));
+   }
+
    if (sctx->screen->dfsm_allowed) {
       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
@@ -3391,8 +3353,8 @@ static bool si_out_of_order_rasterization(struct si_context *sctx)
 
       /* The set of PS invocations is always order invariant,
        * except when early Z/S tests are requested. */
-      if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory &&
-          sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
+      if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.base.writes_memory &&
+          sctx->ps_shader.cso->info.base.fs.early_fragment_tests &&
           !dsa_order_invariant.pass_set)
          return false;
 
@@ -3527,7 +3489,8 @@ static void si_emit_msaa_config(struct si_context *sctx)
       sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
       sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
                      S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
-                     S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
+                     S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) |
+                     S_028BE0_COVERED_CENTROID_IS_CENTER(sctx->chip_class >= GFX10_3);
 
       if (sctx->framebuffer.nr_samples > 1) {
          db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
@@ -3789,13 +3752,15 @@ static void gfx10_make_texture_descriptor(
    state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
               S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples)
                                                    : tex->buffer.b.b.last_level) |
-              S_00A014_PERF_MOD(4);
+              S_00A014_PERF_MOD(4) |
+              S_00A014_BIG_PAGE(screen->info.chip_class >= GFX10_3 &&
+                                tex->buffer.bo_alignment % (64 * 1024) == 0);
    state[6] = 0;
    state[7] = 0;
 
-   if (tex->surface.dcc_offset) {
+   if (vi_dcc_enabled(tex, first_level)) {
       state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
-                  S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
+                  S_00A018_MAX_COMPRESSED_BLOCK_SIZE(tex->surface.u.gfx9.dcc.max_compressed_block_size) |
                   S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
    }
 
@@ -3862,7 +3827,7 @@ static void gfx10_make_texture_descriptor(
          S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
       fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer);
       fmask_state[5] = 0;
-      fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned);
+      fmask_state[6] = S_00A018_META_PIPE_ALIGNED(1);
       fmask_state[7] = 0;
    }
 }
@@ -3993,8 +3958,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu
    }
 
    /* S8 with Z32 HTILE needs a special format. */
-   if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT &&
-       tex->tc_compatible_htile)
+   if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT)
       data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
 
    if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY ||
@@ -4055,7 +4019,7 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu
       state[5] |= S_008F24_LAST_ARRAY(last_layer);
    }
 
-   if (tex->surface.dcc_offset) {
+   if (vi_dcc_enabled(tex, first_level)) {
       state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
    } else {
       /* The last dword is unused by hw. The shader uses it to clear
@@ -4080,43 +4044,43 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu
          data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
          switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
          case FMASK(2, 1):
-            num_format = V_008F14_IMG_FMASK_8_2_1;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_1;
             break;
          case FMASK(2, 2):
-            num_format = V_008F14_IMG_FMASK_8_2_2;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_2_2;
             break;
          case FMASK(4, 1):
-            num_format = V_008F14_IMG_FMASK_8_4_1;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_1;
             break;
          case FMASK(4, 2):
-            num_format = V_008F14_IMG_FMASK_8_4_2;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_2;
             break;
          case FMASK(4, 4):
-            num_format = V_008F14_IMG_FMASK_8_4_4;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_4_4;
             break;
          case FMASK(8, 1):
-            num_format = V_008F14_IMG_FMASK_8_8_1;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_8_8_1;
             break;
          case FMASK(8, 2):
-            num_format = V_008F14_IMG_FMASK_16_8_2;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_16_8_2;
             break;
          case FMASK(8, 4):
-            num_format = V_008F14_IMG_FMASK_32_8_4;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_4;
             break;
          case FMASK(8, 8):
-            num_format = V_008F14_IMG_FMASK_32_8_8;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_8_8;
             break;
          case FMASK(16, 1):
-            num_format = V_008F14_IMG_FMASK_16_16_1;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_16_16_1;
             break;
          case FMASK(16, 2):
-            num_format = V_008F14_IMG_FMASK_32_16_2;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_32_16_2;
             break;
          case FMASK(16, 4):
-            num_format = V_008F14_IMG_FMASK_64_16_4;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_64_16_4;
             break;
          case FMASK(16, 8):
-            num_format = V_008F14_IMG_FMASK_64_16_8;
+            num_format = V_008F14_IMG_NUM_FORMAT_FMASK_64_16_8;
             break;
          default:
             unreachable("invalid nr_samples");
@@ -4186,8 +4150,8 @@ static void si_make_texture_descriptor(struct si_screen *screen, struct si_textu
          fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
          fmask_state[4] |=
             S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
-         fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
-                           S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
+         fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(1) |
+                           S_008F24_META_RB_ALIGNED(1);
       } else {
          fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
          fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
@@ -4268,7 +4232,7 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx
       depth = u_minify(depth, force_level);
    }
 
-   /* This is not needed if state trackers set last_layer correctly. */
+   /* This is not needed if gallium frontends set last_layer correctly. */
    if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D ||
        state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE)
       last_layer = state->u.tex.first_layer;
@@ -4462,6 +4426,9 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
    struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
    unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy;
    unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
+   bool trunc_coord = state->min_img_filter == PIPE_TEX_FILTER_NEAREST &&
+                      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
+                      state->compare_mode == PIPE_TEX_COMPARE_NONE;
    union pipe_color_union clamped_border_color;
 
    if (!rstate) {
@@ -4478,6 +4445,7 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
        S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
        S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) |
        S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
+       S_008F30_TRUNC_COORD(trunc_coord) |
        S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
    rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
                      S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
@@ -4494,7 +4462,7 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
    } else {
       rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
                         S_008F38_FILTER_PREC_FIX(1) |
-                        S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8);
+                        S_008F38_ANISO_OVERRIDE_GFX8(sctx->chip_class >= GFX8);
    }
 
    /* Create sampler resource for integer textures. */
@@ -4538,7 +4506,7 @@ static void si_emit_sample_mask(struct si_context *sctx)
    unsigned mask = sctx->sample_mask;
 
    /* Needed for line and polygon smoothing as well as for the Polaris
-    * small primitive filter. We expect the state tracker to take care of
+    * small primitive filter. We expect the gallium frontend to take care of
     * this for us.
     */
    assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
@@ -4734,8 +4702,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
        * into account would complicate the fast path (where everything
        * is nicely aligned).
        */
-      bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 ||
-                                                       sscreen->info.chip_class == GFX10);
+      bool check_alignment =
+            log_hw_load_size >= 1 &&
+            (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class >= GFX10);
       bool opencode = sscreen->options.vs_fetch_always_opencode;
 
       if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
@@ -4993,8 +4962,6 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
    return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }
 
-static void si_init_config(struct si_context *sctx);
-
 void si_init_state_compute_functions(struct si_context *sctx)
 {
    sctx->b.create_sampler_state = si_create_sampler_state;
@@ -5055,8 +5022,6 @@ void si_init_state_functions(struct si_context *sctx)
    sctx->b.set_tess_state = si_set_tess_state;
 
    sctx->b.set_active_query_state = si_set_active_query_state;
-
-   si_init_config(sctx);
 }
 
 void si_init_screen_state_functions(struct si_screen *sscreen)
@@ -5125,7 +5090,7 @@ static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *p
    }
 }
 
-static void si_init_config(struct si_context *sctx)
+void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
 {
    struct si_screen *sscreen = sctx->screen;
    uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
@@ -5135,59 +5100,25 @@ static void si_init_config(struct si_context *sctx)
    if (!pm4)
       return;
 
-   si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
-   si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
-   si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
-   si_pm4_cmd_end(pm4, false);
-
-   if (has_clear_state) {
-      si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
-      si_pm4_cmd_add(pm4, 0);
-      si_pm4_cmd_end(pm4, false);
-   }
-
-   if (sctx->chip_class <= GFX8)
-      si_set_raster_config(sctx, pm4);
-
-   si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
-   if (!has_clear_state)
-      si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
-
-   /* FIXME calculate these values somehow ??? */
-   if (sctx->chip_class <= GFX8) {
-      si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
-      si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
-   }
+   if (!uses_reg_shadowing) {
+      si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
+      si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1));
+      si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1));
 
-   if (!has_clear_state) {
-      si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
-      si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
-      si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+      if (has_clear_state) {
+         si_pm4_cmd_add(pm4, PKT3(PKT3_CLEAR_STATE, 0, 0));
+         si_pm4_cmd_add(pm4, 0);
+      }
    }
 
-   if (sscreen->info.chip_class <= GFX9)
-      si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
-   if (!has_clear_state)
-      si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
-   if (sctx->chip_class < GFX7)
-      si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
-                     S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));
-
    /* CLEAR_STATE doesn't restore these correctly. */
    si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
    si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
                   S_028244_BR_X(16384) | S_028244_BR_Y(16384));
 
-   /* CLEAR_STATE doesn't clear these correctly on certain generations.
-    * I don't know why. Deduced by trial and error.
-    */
-   if (sctx->chip_class <= GFX7 || !has_clear_state) {
-      si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
-      si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
-      si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
-      si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
-                     S_028034_BR_X(16384) | S_028034_BR_Y(16384));
-   }
+   si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
+   if (!has_clear_state)
+      si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
    if (!has_clear_state) {
       si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
@@ -5200,58 +5131,39 @@ static void si_init_config(struct si_context *sctx)
       si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
       si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
       si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
+      si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
+      si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
+      si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
+      si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
    }
 
-   if (sctx->chip_class >= GFX10) {
-      si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
-      si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
-      si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
-      si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
-      si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
-      si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
-   } else if (sctx->chip_class == GFX9) {
-      si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
-      si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
-      si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
-   } else {
-      /* These registers, when written, also overwrite the CLEAR_STATE
-       * context, so we can't rely on CLEAR_STATE setting them.
-       * It would be an issue if there was another UMD changing them.
-       */
-      si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
-      si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
-      si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
+   si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
+   if (sctx->chip_class >= GFX7)
+      si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));
+
+   if (sctx->chip_class == GFX6) {
+      si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
+                     S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));
    }
 
-   if (sctx->chip_class >= GFX7) {
-      if (sctx->chip_class >= GFX10) {
-         /* Logical CUs 16 - 31 */
-         si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));
-         si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
-         si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff));
-      }
+   if (sctx->chip_class <= GFX7 || !has_clear_state) {
+      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+      si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
 
-      if (sctx->chip_class >= GFX9) {
-         si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
-                        S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
-      } else {
-         si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
-                        S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
-         si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
-         si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
-                        S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
-
-         /* If this is 0, Bonaire can hang even if GS isn't being used.
-          * Other chips are unaffected. These are suboptimal values,
-          * but we don't use on-chip GS.
-          */
-         si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
-                        S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
-      }
+      /* CLEAR_STATE doesn't clear these correctly on certain generations.
+       * I don't know why. Deduced by trial and error.
+       */
+      si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
+      si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
+      si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
+      si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
+                     S_028034_BR_X(16384) | S_028034_BR_Y(16384));
+   }
 
+   if (sctx->chip_class >= GFX7) {
       /* Compute LATE_ALLOC_VS.LIMIT. */
-      unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
-      unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
+      unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
+      unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
       unsigned cu_mask_vs = 0xffff;
       unsigned cu_mask_gs = 0xffff;
 
@@ -5275,7 +5187,7 @@ static void si_init_config(struct si_context *sctx)
          if (!sscreen->info.use_late_alloc) {
             late_alloc_wave64 = 0;
          } else if (num_cu_per_sh <= 4) {
-            /* Too few available compute units per SH. Disallowing
+            /* Too few available compute units per SA. Disallowing
              * VS to run on one CU could hurt us more than late VS
              * allocation would help.
              *
@@ -5298,72 +5210,44 @@ static void si_init_config(struct si_context *sctx)
       si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
                      S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
       si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
-
       si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
                      S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
-
       si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
                      S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
    }
 
-   if (sctx->chip_class >= GFX10) {
-      /* Break up a pixel wave if it contains deallocs for more than
-       * half the parameter cache.
-       *
-       * To avoid a deadlock where pixel waves aren't launched
-       * because they're waiting for more pixels while the frontend
-       * is stuck waiting for PC space, the maximum allowed value is
-       * the size of the PC minus the largest possible allocation for
-       * a single primitive shader subgroup.
-       */
-      si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
-      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+   if (sctx->chip_class <= GFX8) {
+      si_set_raster_config(sctx, pm4);
 
-      if (!has_clear_state) {
-         si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
-                        sscreen->info.pa_sc_tile_steering_override);
-      }
+      /* FIXME calculate these values somehow ??? */
+      si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
+      si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
 
-      /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
-      unsigned meta_write_policy, meta_read_policy;
-      /* TODO: investigate whether LRU improves performance on other chips too */
-      if (sscreen->info.num_render_backends <= 4) {
-         meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
-         meta_read_policy = V_02807C_CACHE_LRU_RD;  /* cache reads */
-      } else {
-         meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
-         meta_read_policy = V_02807C_CACHE_NOA_RD;     /* don't cache reads */
-      }
+      /* These registers, when written, also overwrite the CLEAR_STATE
+       * context, so we can't rely on CLEAR_STATE setting them.
+       * It would be an issue if there was another UMD changing them.
+       */
+      si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
+      si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
+      si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
+   }
 
-      si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
-                     S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-                        S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-                        S_02807C_HTILE_WR_POLICY(meta_write_policy) |
-                        S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
-                        S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
-                        S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
-                        S_02807C_HTILE_RD_POLICY(meta_read_policy));
-
-      si_pm4_set_reg(
-         pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
-         S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) |
-            S_028410_DCC_WR_POLICY(meta_write_policy) |
-            S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
-            S_028410_CMASK_RD_POLICY(meta_read_policy) |
-            S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) |
-            S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
-      si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
+   if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) {
+      si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+                     S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+      si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
+      si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+                     S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
 
-      si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
-                     S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
-      si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
+      /* If this is 0, Bonaire can hang even if GS isn't being used.
+       * Other chips are unaffected. These are suboptimal values,
+       * but we don't use on-chip GS.
+       */
+      si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
+                     S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
    }
 
-   if (sctx->chip_class >= GFX9) {
-      si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
-                     S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
-                        S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6));
-   } else if (sctx->chip_class >= GFX8) {
+   if (sctx->chip_class == GFX8) {
       unsigned vgt_tess_distribution;
 
       vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) |
@@ -5376,26 +5260,97 @@ static void si_init_config(struct si_context *sctx)
          vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
 
       si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
-   } else if (!has_clear_state) {
-      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
-      si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
    }
 
-   si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
-   if (sctx->chip_class >= GFX7) {
-      si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));
+   if (sscreen->info.chip_class <= GFX9) {
+      si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
+   }
+
+   if (sctx->chip_class == GFX9) {
+      si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
+      si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
+      si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
    }
-   si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS);
 
    if (sctx->chip_class >= GFX9) {
+      si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                     S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
+
+      si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
+                     S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
+                     S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6));
       si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
                      S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
-                        S_028C48_MAX_PRIM_PER_BATCH(1023));
+                     S_028C48_MAX_PRIM_PER_BATCH(1023));
       si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
                      S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
+
       si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
+      si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY,
+                     sctx->chip_class >= GFX10 ? 0x20 : 0);
+   }
+
+   if (sctx->chip_class >= GFX10) {
+      /* Logical CUs 16 - 31 */
+      si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff));
+      si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
+      si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));
+
+      si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
+      si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);
+      si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0);
+      si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0);
+      si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
+      si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0);
+      si_pm4_set_reg(pm4, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0);
+      si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0);
+      si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0);
+      si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0);
+      si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0);
+      si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0);
+      si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0);
+      si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0);
+      si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0);
+      si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0);
+
+      si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
+                     S_00B0C0_SOFT_GROUPING_EN(1) |
+                     S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
+      si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
+
+      si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
+      si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
+
+      /* Break up a pixel wave if it contains deallocs for more than
+       * half the parameter cache.
+       *
+       * To avoid a deadlock where pixel waves aren't launched
+       * because they're waiting for more pixels while the frontend
+       * is stuck waiting for PC space, the maximum allowed value is
+       * the size of the PC minus the largest possible allocation for
+       * a single primitive shader subgroup.
+       */
+      si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
+      /* Reuse for legacy (non-NGG) only. */
+      si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
+
+      if (!has_clear_state) {
+         si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
+                        sscreen->info.pa_sc_tile_steering_override);
+      }
+
+
+      si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
+      si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
+      si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
+      si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
+      si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
+   }
+
+   if (sctx->chip_class >= GFX10_3) {
+      si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff);
+      si_pm4_set_reg(pm4, 0x28848, 1 << 9); /* This fixes sample shading. */
    }
 
-   si_pm4_upload_indirect_buffer(sctx, pm4);
-   sctx->init_config = pm4;
+   sctx->cs_preamble_state = pm4;
 }