amd,radeonsi: rename radeon_winsys_cs -> radeon_cmdbuf
[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
index 2edd9826b2f5dbdc6c7d6208beae75559ab27227..92a1d151262214e509d215590f1e459d10baf33b 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright 2012 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *      Christian König <christian.koenig@amd.com>
  */
 
-#include "si_pipe.h"
-#include "sid.h"
+#include "si_build_pm4.h"
 #include "gfx9d.h"
-#include "radeon/r600_cs.h"
-#include "radeon/r600_query.h"
+#include "si_query.h"
 
 #include "util/u_dual_blend.h"
 #include "util/u_format.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
 
-/* Initialize an external atom (owned by ../radeon). */
-static void
-si_init_external_atom(struct si_context *sctx, struct r600_atom *atom,
-                     struct r600_atom **list_elem)
-{
-       atom->id = list_elem - sctx->atoms.array;
-       *list_elem = atom;
-}
-
-/* Initialize an atom owned by radeonsi.  */
-void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
-                 struct r600_atom **list_elem,
-                 void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
-{
-       atom->emit = (void*)emit_func;
-       atom->id = list_elem - sctx->atoms.array;
-       *list_elem = atom;
-}
-
 static unsigned si_map_swizzle(unsigned swizzle)
 {
        switch (swizzle) {
@@ -87,9 +64,9 @@ static unsigned si_pack_float_12p4(float x)
  * CB_TARGET_MASK is emitted here to avoid a hang with dual source blending
  * if there is not enough PS outputs.
  */
-static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_cb_render_state(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
        struct si_state_blend *blend = sctx->queued.named.blend;
        /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
         * but you never know. */
@@ -110,7 +87,8 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
            (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
                cb_target_mask = 0;
 
-       radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, cb_target_mask);
+       radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
+                                  SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
 
        /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
         * I think we don't have to do anything between IBs.
@@ -123,8 +101,27 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
        }
 
+       if (sctx->chip_class >= VI) {
+               /* DCC MSAA workaround for blending.
+                * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
+                * COMBINER_DISABLE, but that would be more complicated.
+                */
+               bool oc_disable = (sctx->chip_class == VI ||
+                                  sctx->chip_class == GFX9) &&
+                                 blend &&
+                                 blend->blend_enable_4bit & cb_target_mask &&
+                                 sctx->framebuffer.nr_samples >= 2;
+
+               radeon_opt_set_context_reg(
+                               sctx, R_028424_CB_DCC_CONTROL,
+                               SI_TRACKED_CB_DCC_CONTROL,
+                               S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
+                               S_028424_OVERWRITE_COMBINER_WATERMARK(4) |
+                               S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable));
+       }
+
        /* RB+ register settings. */
-       if (sctx->screen->b.rbplus_allowed) {
+       if (sctx->screen->rbplus_allowed) {
                unsigned spi_shader_col_format =
                        sctx->ps_shader.cso ?
                        sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format : 0;
@@ -209,10 +206,10 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
                                break;
 
                        case V_028C70_COLOR_32:
-                               if (swap == V_0280A0_SWAP_STD &&
+                               if (swap == V_028C70_SWAP_STD &&
                                    spi_format == V_028714_SPI_SHADER_32_R)
                                        sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
-                               else if (swap == V_0280A0_SWAP_ALT_REV &&
+                               else if (swap == V_028C70_SWAP_ALT_REV &&
                                         spi_format == V_028714_SPI_SHADER_32_AR)
                                        sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
                                break;
@@ -224,8 +221,8 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
                                    spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
                                    spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
                                    spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
-                                       if (swap == V_0280A0_SWAP_STD ||
-                                           swap == V_0280A0_SWAP_STD_REV)
+                                       if (swap == V_028C70_SWAP_STD ||
+                                           swap == V_028C70_SWAP_STD_REV)
                                                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
                                        else
                                                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
@@ -248,15 +245,11 @@ static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *a
                        }
                }
 
-               radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 3);
-               radeon_emit(cs, sx_ps_downconvert);     /* R_028754_SX_PS_DOWNCONVERT */
-               radeon_emit(cs, sx_blend_opt_epsilon);  /* R_028758_SX_BLEND_OPT_EPSILON */
-               radeon_emit(cs, sx_blend_opt_control);  /* R_02875C_SX_BLEND_OPT_CONTROL */
-       } else if (sctx->screen->b.has_rbplus) {
-               radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 3);
-               radeon_emit(cs, 0);     /* R_028754_SX_PS_DOWNCONVERT */
-               radeon_emit(cs, 0);     /* R_028758_SX_BLEND_OPT_EPSILON */
-               radeon_emit(cs, 0);     /* R_02875C_SX_BLEND_OPT_CONTROL */
+               /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
+               radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
+                                           SI_TRACKED_SX_PS_DOWNCONVERT,
+                                           sx_ps_downconvert, sx_blend_opt_epsilon,
+                                           sx_blend_opt_control);
        }
 }
 
@@ -278,7 +271,7 @@ static uint32_t si_translate_blend_function(int blend_func)
        case PIPE_BLEND_MAX:
                return V_028780_COMB_MAX_DST_SRC;
        default:
-               R600_ERR("Unknown blend function %d\n", blend_func);
+               PRINT_ERR("Unknown blend function %d\n", blend_func);
                assert(0);
                break;
        }
@@ -327,7 +320,7 @@ static uint32_t si_translate_blend_factor(int blend_fact)
        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
                return V_028780_BLEND_INV_SRC1_ALPHA;
        default:
-               R600_ERR("Bad blend factor %d not supported!\n", blend_fact);
+               PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
                assert(0);
                break;
        }
@@ -377,6 +370,48 @@ static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
        }
 }
 
+static void si_blend_check_commutativity(struct si_screen *sscreen,
+                                        struct si_state_blend *blend,
+                                        enum pipe_blend_func func,
+                                        enum pipe_blendfactor src,
+                                        enum pipe_blendfactor dst,
+                                        unsigned chanmask)
+{
+       /* Src factor is allowed when it does not depend on Dst */
+       static const uint32_t src_allowed =
+               (1u << PIPE_BLENDFACTOR_ONE) |
+               (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
+               (1u << PIPE_BLENDFACTOR_SRC_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
+               (1u << PIPE_BLENDFACTOR_CONST_COLOR) |
+               (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_SRC1_COLOR) |
+               (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_ZERO) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
+               (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
+               (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
+
+       if (dst == PIPE_BLENDFACTOR_ONE &&
+           (src_allowed & (1u << src))) {
+               /* Addition is commutative, but floating point addition isn't
+                * associative: subtle changes can be introduced via different
+                * rounding.
+                *
+                * Out-of-order is also non-deterministic, which means that
+                * this breaks OpenGL invariance requirements. So only enable
+                * out-of-order additive blending if explicitly allowed by a
+                * setting.
+                */
+               if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
+                   (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
+                       blend->commutative_4bit |= chanmask;
+       }
+}
+
 /**
  * Get rid of DST in the blend factors by commuting the operands:
  *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
@@ -423,6 +458,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
        blend->alpha_to_coverage = state->alpha_to_coverage;
        blend->alpha_to_one = state->alpha_to_one;
        blend->dual_src_blend = util_blend_state_is_dual(state, 0);
+       blend->logicop_enable = state->logicop_enable;
 
        if (state->logicop_enable) {
                color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
@@ -492,6 +528,11 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
                        continue;
                }
 
+               si_blend_check_commutativity(sctx->screen, blend,
+                                            eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
+               si_blend_check_commutativity(sctx->screen, blend,
+                                            eqA, srcA, dstA, 0x8 << (4 * i));
+
                /* Blending optimizations for RB+.
                 * These transformations don't change the behavior.
                 *
@@ -567,7 +608,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
                color_control |= S_028808_MODE(V_028808_CB_DISABLE);
        }
 
-       if (sctx->screen->b.has_rbplus) {
+       if (sctx->screen->rbplus_allowed) {
                /* Disable RB+ blend optimizations for dual source blending.
                 * Vulkan does this.
                 */
@@ -608,13 +649,16 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
        if (!state)
                return;
 
-       if (!old_blend ||
-            old_blend->cb_target_mask != blend->cb_target_mask ||
-            old_blend->dual_src_blend != blend->dual_src_blend)
-               si_mark_atom_dirty(sctx, &sctx->cb_render_state);
-
        si_pm4_bind_state(sctx, blend, state);
 
+       if (!old_blend ||
+           old_blend->cb_target_mask != blend->cb_target_mask ||
+           old_blend->dual_src_blend != blend->dual_src_blend ||
+           (old_blend->blend_enable_4bit != blend->blend_enable_4bit &&
+            sctx->framebuffer.nr_samples >= 2 &&
+            sctx->screen->dcc_msaa_allowed))
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+
        if (!old_blend ||
            old_blend->cb_target_mask != blend->cb_target_mask ||
            old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
@@ -623,6 +667,21 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
            old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
            old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
                sctx->do_update_shaders = true;
+
+       if (sctx->screen->dpbb_allowed &&
+           (!old_blend ||
+            old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
+            old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+            old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+       if (sctx->screen->has_out_of_order_rast &&
+           (!old_blend ||
+            (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+             old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
+             old_blend->commutative_4bit != blend->commutative_4bit ||
+             old_blend->logicop_enable != blend->logicop_enable)))
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
@@ -639,12 +698,12 @@ static void si_set_blend_color(struct pipe_context *ctx,
 
        sctx->blend_color.state = *state;
        sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
-       si_mark_atom_dirty(sctx, &sctx->blend_color.atom);
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
 }
 
-static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_blend_color(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
        radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
        radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
@@ -666,7 +725,7 @@ static void si_set_clip_state(struct pipe_context *ctx,
 
        sctx->clip_state.state = *state;
        sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
-       si_mark_atom_dirty(sctx, &sctx->clip_state.atom);
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);
 
        cb.buffer = NULL;
        cb.user_buffer = state->ucp;
@@ -676,17 +735,16 @@ static void si_set_clip_state(struct pipe_context *ctx,
        pipe_resource_reference(&cb.buffer, NULL);
 }
 
-static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_clip_state(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
        radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
        radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
 }
 
-static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_clip_regs(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_shader *vs = si_get_vs_state(sctx);
        struct si_shader_selector *vs_sel = vs->selector;
        struct tgsi_shader_info *info = &vs_sel->info;
@@ -714,12 +772,14 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
        clipdist_mask &= rs->clip_plane_enable;
        culldist_mask |= clipdist_mask;
 
-       radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL,
+       radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+               SI_TRACKED_PA_CL_VS_OUT_CNTL,
                vs_sel->pa_cl_vs_out_cntl |
                S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
                S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
                clipdist_mask | (culldist_mask << 8));
-       radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
+       radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
+               SI_TRACKED_PA_CL_CLIP_CNTL,
                rs->pa_cl_clip_cntl |
                ucp_mask |
                S_028810_CLIP_DISABLE(window_space));
@@ -795,6 +855,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
        rs->line_stipple_enable = state->line_stipple_enable;
        rs->poly_stipple_enable = state->poly_stipple_enable;
        rs->line_smooth = state->line_smooth;
+       rs->line_width = state->line_width;
        rs->poly_smooth = state->poly_smooth;
        rs->uses_poly_offset = state->offset_point || state->offset_line ||
                               state->offset_tri;
@@ -815,7 +876,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 
        si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
                S_0286D4_FLAT_SHADE_ENA(1) |
-               S_0286D4_PNT_SPRITE_ENA(1) |
+               S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
                S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
                S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
                S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
@@ -834,20 +895,22 @@ static void *si_create_rs_state(struct pipe_context *ctx,
                psize_min = state->point_size;
                psize_max = state->point_size;
        }
+       rs->max_point_size = psize_max;
+
        /* Divide by two, because 0.5 = 1 pixel. */
        si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
                        S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min/2)) |
                        S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max/2)));
 
-       tmp = (unsigned)state->line_width * 8;
-       si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL, S_028A08_WIDTH(tmp));
+       si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
+                      S_028A08_WIDTH(si_pack_float_12p4(state->line_width/2)));
        si_pm4_set_reg(pm4, R_028A48_PA_SC_MODE_CNTL_0,
                       S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
                       S_028A48_MSAA_ENABLE(state->multisample ||
                                            state->poly_smooth ||
                                            state->line_smooth) |
                       S_028A48_VPORT_SCISSOR_ENABLE(1) |
-                      S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->b.chip_class >= GFX9));
+                      S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));
 
        si_pm4_set_reg(pm4, R_028BE4_PA_SU_VTX_CNTL,
                       S_028BE4_PIX_CENTER(state->half_pixel_center) |
@@ -929,26 +992,41 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
                return;
 
        if (!old_rs || old_rs->multisample_enable != rs->multisample_enable) {
-               si_mark_atom_dirty(sctx, &sctx->db_render_state);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
                /* Update the small primitive filter workaround if necessary. */
                if (sctx->screen->has_msaa_sample_loc_bug &&
                    sctx->framebuffer.nr_samples > 1)
-                       si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
+                       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
        }
 
        sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
        sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);
 
-       r600_viewport_set_rast_deps(&sctx->b, rs->scissor_enable, rs->clip_halfz);
-
        si_pm4_bind_state(sctx, rasterizer, rs);
        si_update_poly_offset_state(sctx);
 
+       if (!old_rs ||
+           old_rs->scissor_enable != rs->scissor_enable) {
+               sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
+       }
+
+       if (!old_rs ||
+           old_rs->line_width != rs->line_width ||
+           old_rs->max_point_size != rs->max_point_size)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+
+       if (!old_rs ||
+           old_rs->clip_halfz != rs->clip_halfz) {
+               sctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
+       }
+
        if (!old_rs ||
            old_rs->clip_plane_enable != rs->clip_plane_enable ||
            old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
-               si_mark_atom_dirty(sctx, &sctx->clip_regs);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 
        sctx->ia_multi_vgt_param_key.u.line_stipple_enabled =
                rs->line_stipple_enable;
@@ -983,9 +1061,9 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state)
 /*
  * infeered state between dsa and stencil ref
  */
-static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_stencil_ref(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
        struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
        struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
 
@@ -1009,7 +1087,7 @@ static void si_set_stencil_ref(struct pipe_context *ctx,
                return;
 
        sctx->stencil_ref.state = *state;
-       si_mark_atom_dirty(sctx, &sctx->stencil_ref.atom);
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
 }
 
 
@@ -1037,16 +1115,49 @@ static uint32_t si_translate_stencil_op(int s_op)
        case PIPE_STENCIL_OP_INVERT:
                return V_02842C_STENCIL_INVERT;
        default:
-               R600_ERR("Unknown stencil op %d", s_op);
+               PRINT_ERR("Unknown stencil op %d", s_op);
                assert(0);
                break;
        }
        return 0;
 }
 
+static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s)
+{
+       return s->enabled && s->writemask &&
+              (s->fail_op  != PIPE_STENCIL_OP_KEEP ||
+               s->zfail_op != PIPE_STENCIL_OP_KEEP ||
+               s->zpass_op != PIPE_STENCIL_OP_KEEP);
+}
+
+static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
+{
+       /* REPLACE is normally order invariant, except when the stencil
+        * reference value is written by the fragment shader. Tracking this
+        * interaction does not seem worth the effort, so be conservative. */
+       return op != PIPE_STENCIL_OP_INCR &&
+              op != PIPE_STENCIL_OP_DECR &&
+              op != PIPE_STENCIL_OP_REPLACE;
+}
+
+/* Compute whether, assuming Z writes are disabled, this stencil state is order
+ * invariant in the sense that the set of passing fragments as well as the
+ * final stencil buffer result does not depend on the order of fragments. */
+static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state)
+{
+       return !state->enabled || !state->writemask ||
+              /* The following assumes that Z writes are disabled. */
+              (state->func == PIPE_FUNC_ALWAYS &&
+               si_order_invariant_stencil_op(state->zpass_op) &&
+               si_order_invariant_stencil_op(state->zfail_op)) ||
+              (state->func == PIPE_FUNC_NEVER &&
+               si_order_invariant_stencil_op(state->fail_op));
+}
+
 static void *si_create_dsa_state(struct pipe_context *ctx,
                                 const struct pipe_depth_stencil_alpha_state *state)
 {
+       struct si_context *sctx = (struct si_context *)ctx;
        struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
        struct si_pm4_state *pm4 = &dsa->pm4;
        unsigned db_depth_control;
@@ -1101,6 +1212,52 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
                si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
        }
 
+       dsa->depth_enabled = state->depth.enabled;
+       dsa->depth_write_enabled = state->depth.enabled &&
+                                  state->depth.writemask;
+       dsa->stencil_enabled = state->stencil[0].enabled;
+       dsa->stencil_write_enabled = state->stencil[0].enabled &&
+                                    (si_dsa_writes_stencil(&state->stencil[0]) ||
+                                     si_dsa_writes_stencil(&state->stencil[1]));
+       dsa->db_can_write = dsa->depth_write_enabled ||
+                           dsa->stencil_write_enabled;
+
+       bool zfunc_is_ordered =
+               state->depth.func == PIPE_FUNC_NEVER ||
+               state->depth.func == PIPE_FUNC_LESS ||
+               state->depth.func == PIPE_FUNC_LEQUAL ||
+               state->depth.func == PIPE_FUNC_GREATER ||
+               state->depth.func == PIPE_FUNC_GEQUAL;
+
+       bool nozwrite_and_order_invariant_stencil =
+               !dsa->db_can_write ||
+               (!dsa->depth_write_enabled &&
+                si_order_invariant_stencil_state(&state->stencil[0]) &&
+                si_order_invariant_stencil_state(&state->stencil[1]));
+
+       dsa->order_invariance[1].zs =
+               nozwrite_and_order_invariant_stencil ||
+               (!dsa->stencil_write_enabled && zfunc_is_ordered);
+       dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
+
+       dsa->order_invariance[1].pass_set =
+               nozwrite_and_order_invariant_stencil ||
+               (!dsa->stencil_write_enabled &&
+                (state->depth.func == PIPE_FUNC_ALWAYS ||
+                 state->depth.func == PIPE_FUNC_NEVER));
+       dsa->order_invariance[0].pass_set =
+               !dsa->depth_write_enabled ||
+               (state->depth.func == PIPE_FUNC_ALWAYS ||
+                state->depth.func == PIPE_FUNC_NEVER);
+
+       dsa->order_invariance[1].pass_last =
+               sctx->screen->assume_no_z_fights &&
+               !dsa->stencil_write_enabled &&
+               dsa->depth_write_enabled && zfunc_is_ordered;
+       dsa->order_invariance[0].pass_last =
+               sctx->screen->assume_no_z_fights &&
+               dsa->depth_write_enabled && zfunc_is_ordered;
+
        return dsa;
 }
 
@@ -1118,11 +1275,24 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
        if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
                   sizeof(struct si_dsa_stencil_ref_part)) != 0) {
                sctx->stencil_ref.dsa_part = dsa->stencil_ref;
-               si_mark_atom_dirty(sctx, &sctx->stencil_ref.atom);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
        }
 
        if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func)
                sctx->do_update_shaders = true;
+
+       if (sctx->screen->dpbb_allowed &&
+           (!old_dsa ||
+            (old_dsa->depth_enabled != dsa->depth_enabled ||
+             old_dsa->stencil_enabled != dsa->stencil_enabled ||
+             old_dsa->db_can_write != dsa->db_can_write)))
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+
+       if (sctx->screen->has_out_of_order_rast &&
+           (!old_dsa ||
+            memcmp(old_dsa->order_invariance, dsa->order_invariance,
+                   sizeof(old_dsa->order_invariance))))
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
@@ -1135,7 +1305,7 @@ static void *si_create_db_flush_dsa(struct si_context *sctx)
 {
        struct pipe_depth_stencil_alpha_state dsa = {};
 
-       return sctx->b.b.create_depth_stencil_alpha_state(&sctx->b.b, &dsa);
+       return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
 }
 
 /* DB RENDER STATE */
@@ -1146,91 +1316,95 @@ static void si_set_active_query_state(struct pipe_context *ctx, boolean enable)
 
        /* Pipeline stat & streamout queries. */
        if (enable) {
-               sctx->b.flags &= ~R600_CONTEXT_STOP_PIPELINE_STATS;
-               sctx->b.flags |= R600_CONTEXT_START_PIPELINE_STATS;
+               sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
+               sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
        } else {
-               sctx->b.flags &= ~R600_CONTEXT_START_PIPELINE_STATS;
-               sctx->b.flags |= R600_CONTEXT_STOP_PIPELINE_STATS;
+               sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
+               sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
        }
 
        /* Occlusion queries. */
        if (sctx->occlusion_queries_disabled != !enable) {
                sctx->occlusion_queries_disabled = !enable;
-               si_mark_atom_dirty(sctx, &sctx->db_render_state);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
        }
 }
 
-static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
+void si_set_occlusion_query_state(struct si_context *sctx,
+                                 bool old_perfect_enable)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
+
+       bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
 
-       si_mark_atom_dirty(sctx, &sctx->db_render_state);
+       if (perfect_enable != old_perfect_enable)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
-static void si_save_qbo_state(struct pipe_context *ctx, struct r600_qbo_state *st)
+void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 {
-       struct si_context *sctx = (struct si_context*)ctx;
-
        st->saved_compute = sctx->cs_shader_state.program;
 
        si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
        si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
 }
 
-static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
+static void si_emit_db_render_state(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-       unsigned db_shader_control;
-
-       radeon_set_context_reg_seq(cs, R_028000_DB_RENDER_CONTROL, 2);
+       unsigned db_shader_control, db_render_control, db_count_control;
 
        /* DB_RENDER_CONTROL */
        if (sctx->dbcb_depth_copy_enabled ||
            sctx->dbcb_stencil_copy_enabled) {
-               radeon_emit(cs,
-                           S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
-                           S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
-                           S_028000_COPY_CENTROID(1) |
-                           S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample));
+               db_render_control =
+                       S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
+                       S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
+                       S_028000_COPY_CENTROID(1) |
+                       S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
        } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
-               radeon_emit(cs,
-                           S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
-                           S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace));
+               db_render_control =
+                       S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
+                       S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
        } else {
-               radeon_emit(cs,
-                           S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
-                           S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear));
+               db_render_control =
+                       S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
+                       S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
        }
 
        /* DB_COUNT_CONTROL (occlusion queries) */
-       if (sctx->b.num_occlusion_queries > 0 &&
+       if (sctx->num_occlusion_queries > 0 &&
            !sctx->occlusion_queries_disabled) {
-               bool perfect = sctx->b.num_perfect_occlusion_queries > 0;
-
-               if (sctx->b.chip_class >= CIK) {
-                       radeon_emit(cs,
-                                   S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-                                   S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |
-                                   S_028004_ZPASS_ENABLE(1) |
-                                   S_028004_SLICE_EVEN_ENABLE(1) |
-                                   S_028004_SLICE_ODD_ENABLE(1));
+               bool perfect = sctx->num_perfect_occlusion_queries > 0;
+
+               if (sctx->chip_class >= CIK) {
+                       db_count_control =
+                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |
+                               S_028004_ZPASS_ENABLE(1) |
+                               S_028004_SLICE_EVEN_ENABLE(1) |
+                               S_028004_SLICE_ODD_ENABLE(1);
                } else {
-                       radeon_emit(cs,
-                                   S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-                                   S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples));
+                       db_count_control =
+                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
                }
        } else {
                /* Disable occlusion queries. */
-               if (sctx->b.chip_class >= CIK) {
-                       radeon_emit(cs, 0);
+               if (sctx->chip_class >= CIK) {
+                       db_count_control = 0;
                } else {
-                       radeon_emit(cs, S_028004_ZPASS_INCREMENT_DISABLE(1));
+                       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
                }
        }
 
+       radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,
+                                   SI_TRACKED_DB_RENDER_CONTROL, db_render_control,
+                                   db_count_control);
+
        /* DB_RENDER_OVERRIDE2 */
-       radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2,
+       radeon_opt_set_context_reg(sctx,  R_028010_DB_RENDER_OVERRIDE2,
+               SI_TRACKED_DB_RENDER_OVERRIDE2,
                S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
                S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
                S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
@@ -1238,21 +1412,21 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
        db_shader_control = sctx->ps_db_shader_control;
 
        /* Bug workaround for smoothing (overrasterization) on SI. */
-       if (sctx->b.chip_class == SI && sctx->smoothing_enabled) {
+       if (sctx->chip_class == SI && sctx->smoothing_enabled) {
                db_shader_control &= C_02880C_Z_ORDER;
                db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
        }
 
        /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
-       if (!rs || !rs->multisample_enable)
+       if (!rs->multisample_enable)
                db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
 
-       if (sctx->screen->b.has_rbplus &&
-           !sctx->screen->b.rbplus_allowed)
+       if (sctx->screen->has_rbplus &&
+           !sctx->screen->rbplus_allowed)
                db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
 
-       radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL,
-                              db_shader_control);
+       radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
+                                  SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
 }
 
 /*
@@ -1261,6 +1435,8 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
 static uint32_t si_translate_colorformat(enum pipe_format format)
 {
        const struct util_format_description *desc = util_format_description(format);
+       if (!desc)
+               return V_028C70_COLOR_INVALID;
 
 #define HAS_SIZE(x,y,z,w) \
        (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \
@@ -1327,6 +1503,8 @@ static uint32_t si_translate_colorformat(enum pipe_format format)
                        }
                } else if (HAS_SIZE(5,5,5,1)) {
                        return V_028C70_COLOR_1_5_5_5;
+               } else if (HAS_SIZE(1,5,5,5)) {
+                       return V_028C70_COLOR_5_5_5_1;
                } else if (HAS_SIZE(10,10,10,2)) {
                        return V_028C70_COLOR_2_10_10_10;
                }
@@ -1405,9 +1583,6 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
                                       int first_non_void)
 {
        struct si_screen *sscreen = (struct si_screen*)screen;
-       bool enable_compressed_formats = (sscreen->b.info.drm_major == 2 &&
-                                         sscreen->b.info.drm_minor >= 31) ||
-                                        sscreen->b.info.drm_major == 3;
        bool uniform = true;
        int i;
 
@@ -1425,7 +1600,13 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
                         * gathers in stencil sampling. This affects at least
                         * GL45-CTS.texture_cube_map_array.sampling on VI.
                         */
-                       return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+                       if (sscreen->info.chip_class <= VI)
+                               return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+
+                       if (format == PIPE_FORMAT_X24S8_UINT)
+                               return V_008F14_IMG_DATA_FORMAT_8_24;
+                       else
+                               return V_008F14_IMG_DATA_FORMAT_24_8;
                case PIPE_FORMAT_Z24X8_UNORM:
                case PIPE_FORMAT_Z24_UNORM_S8_UINT:
                        return V_008F14_IMG_DATA_FORMAT_8_24;
@@ -1456,7 +1637,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
-               if (!enable_compressed_formats)
+               if (!sscreen->info.has_format_bc1_through_bc7)
                        goto out_unknown;
 
                switch (format) {
@@ -1476,8 +1657,9 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
-           (sscreen->b.family == CHIP_STONEY ||
-            sscreen->b.chip_class >= GFX9)) {
+           (sscreen->info.family == CHIP_STONEY ||
+            sscreen->info.family == CHIP_VEGA10 ||
+            sscreen->info.family == CHIP_RAVEN)) {
                switch (format) {
                case PIPE_FORMAT_ETC1_RGB8:
                case PIPE_FORMAT_ETC2_RGB8:
@@ -1501,7 +1683,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
-               if (!enable_compressed_formats)
+               if (!sscreen->info.has_format_bc1_through_bc7)
                        goto out_unknown;
 
                switch (format) {
@@ -1530,13 +1712,9 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-               if (!enable_compressed_formats)
+               if (!sscreen->info.has_format_bc1_through_bc7)
                        goto out_unknown;
 
-               if (!util_format_s3tc_enabled) {
-                       goto out_unknown;
-               }
-
                switch (format) {
                case PIPE_FORMAT_DXT1_RGB:
                case PIPE_FORMAT_DXT1_RGBA:
@@ -1589,6 +1767,12 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
                            desc->channel[3].size == 1) {
                                return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
                        }
+                       if (desc->channel[0].size == 1 &&
+                           desc->channel[1].size == 5 &&
+                           desc->channel[2].size == 5 &&
+                           desc->channel[3].size == 5) {
+                               return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
+                       }
                        if (desc->channel[0].size == 10 &&
                            desc->channel[1].size == 10 &&
                            desc->channel[2].size == 10 &&
@@ -1651,7 +1835,6 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
        }
 
 out_unknown:
-       /* R600_ERR("Unable to handle texformat %d %s\n", format, util_format_name(format)); */
        return ~0;
 }
 
@@ -1717,7 +1900,7 @@ static unsigned si_tex_compare(unsigned compare)
 static unsigned si_tex_dim(struct si_screen *sscreen, struct r600_texture *rtex,
                           unsigned view_target, unsigned nr_samples)
 {
-       unsigned res_target = rtex->resource.b.b.target;
+       unsigned res_target = rtex->buffer.b.b.target;
 
        if (view_target == PIPE_TEXTURE_CUBE ||
            view_target == PIPE_TEXTURE_CUBE_ARRAY)
@@ -1730,7 +1913,7 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct r600_texture *rtex,
        /* GFX9 allocates 1D textures as 2D. */
        if ((res_target == PIPE_TEXTURE_1D ||
             res_target == PIPE_TEXTURE_1D_ARRAY) &&
-           sscreen->b.chip_class >= GFX9 &&
+           sscreen->info.chip_class >= GFX9 &&
            rtex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
                if (res_target == PIPE_TEXTURE_1D)
                        res_target = PIPE_TEXTURE_2D;
@@ -1765,7 +1948,11 @@ static unsigned si_tex_dim(struct si_screen *sscreen, struct r600_texture *rtex,
 
 static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format)
 {
-       return si_translate_texformat(screen, format, util_format_description(format),
+       const struct util_format_description *desc = util_format_description(format);
+       if (!desc)
+               return false;
+
+       return si_translate_texformat(screen, format, desc,
                                      util_format_get_first_non_void_channel(format)) != ~0U;
 }
 
@@ -1894,6 +2081,8 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen,
                          PIPE_BIND_VERTEX_BUFFER)) == 0);
 
        desc = util_format_description(format);
+       if (!desc)
+               return 0;
 
        /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
         * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
@@ -1923,7 +2112,7 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen,
 static bool si_is_colorbuffer_format_supported(enum pipe_format format)
 {
        return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
-               r600_translate_colorswap(format, false) != ~0U;
+               si_translate_colorswap(format, false) != ~0U;
 }
 
 static bool si_is_zs_format_supported(enum pipe_format format)
@@ -1937,10 +2126,11 @@ static boolean si_is_format_supported(struct pipe_screen *screen,
                                      unsigned sample_count,
                                      unsigned usage)
 {
+       struct si_screen *sscreen = (struct si_screen *)screen;
        unsigned retval = 0;
 
        if (target >= PIPE_MAX_TEXTURE_TYPES) {
-               R600_ERR("r600: unsupported texture type %d\n", target);
+               PRINT_ERR("r600: unsupported texture type %d\n", target);
                return false;
        }
 
@@ -1960,6 +2150,10 @@ static boolean si_is_format_supported(struct pipe_screen *screen,
                case 8:
                        break;
                case 16:
+                       /* Allow resource_copy_region with nr_samples == 16. */
+                       if (sscreen->eqaa_force_coverage_samples == 16 &&
+                           !util_format_is_depth_or_stencil(format))
+                               return true;
                        if (format == PIPE_FORMAT_NONE)
                                return true;
                        else
@@ -2141,39 +2335,36 @@ static void si_initialize_color_surface(struct si_context *sctx,
                                        struct r600_surface *surf)
 {
        struct r600_texture *rtex = (struct r600_texture*)surf->base.texture;
-       unsigned color_info, color_attrib, color_view;
+       unsigned color_info, color_attrib;
        unsigned format, swap, ntype, endian;
        const struct util_format_description *desc;
-       int i;
+       int firstchan;
        unsigned blend_clamp = 0, blend_bypass = 0;
 
-       color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
-                    S_028C6C_SLICE_MAX(surf->base.u.tex.last_layer);
-
        desc = util_format_description(surf->base.format);
-       for (i = 0; i < 4; i++) {
-               if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+       for (firstchan = 0; firstchan < 4; firstchan++) {
+               if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
                        break;
                }
        }
-       if (i == 4 || desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+       if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
                ntype = V_028C70_NUMBER_FLOAT;
        } else {
                ntype = V_028C70_NUMBER_UNORM;
                if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
                        ntype = V_028C70_NUMBER_SRGB;
-               else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
-                       if (desc->channel[i].pure_integer) {
+               else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
+                       if (desc->channel[firstchan].pure_integer) {
                                ntype = V_028C70_NUMBER_SINT;
                        } else {
-                               assert(desc->channel[i].normalized);
+                               assert(desc->channel[firstchan].normalized);
                                ntype = V_028C70_NUMBER_SNORM;
                        }
-               } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-                       if (desc->channel[i].pure_integer) {
+               } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+                       if (desc->channel[firstchan].pure_integer) {
                                ntype = V_028C70_NUMBER_UINT;
                        } else {
-                               assert(desc->channel[i].normalized);
+                               assert(desc->channel[firstchan].normalized);
                                ntype = V_028C70_NUMBER_UNORM;
                        }
                }
@@ -2181,10 +2372,10 @@ static void si_initialize_color_surface(struct si_context *sctx,
 
        format = si_translate_colorformat(surf->base.format);
        if (format == V_028C70_COLOR_INVALID) {
-               R600_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
+               PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
        }
        assert(format != V_028C70_COLOR_INVALID);
-       swap = r600_translate_colorswap(surf->base.format, false);
+       swap = si_translate_colorswap(surf->base.format, false);
        endian = si_colorformat_endian_swap(format);
 
        /* blend clamp should be set for all NORM/SRGB types */
@@ -2229,58 +2420,71 @@ static void si_initialize_color_surface(struct si_context *sctx,
        color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
                                                  util_format_is_intensity(surf->base.format));
 
-       if (rtex->resource.b.b.nr_samples > 1) {
-               unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
+       if (rtex->buffer.b.b.nr_samples > 1) {
+               unsigned log_samples = util_logbase2(rtex->buffer.b.b.nr_samples);
+               unsigned log_fragments = util_logbase2(rtex->num_color_samples);
 
                color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
-                               S_028C74_NUM_FRAGMENTS(log_samples);
+                               S_028C74_NUM_FRAGMENTS(log_fragments);
 
-               if (rtex->fmask.size) {
+               if (rtex->surface.fmask_size) {
                        color_info |= S_028C70_COMPRESSION(1);
-                       unsigned fmask_bankh = util_logbase2(rtex->fmask.bank_height);
+                       unsigned fmask_bankh = util_logbase2(rtex->surface.u.legacy.fmask.bankh);
 
-                       if (sctx->b.chip_class == SI) {
+                       if (sctx->chip_class == SI) {
                                /* due to a hw bug, FMASK_BANK_HEIGHT must be set on SI too */
                                color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
                        }
                }
        }
 
-       surf->cb_color_view = color_view;
-       surf->cb_color_info = color_info;
-       surf->cb_color_attrib = color_attrib;
+       if (sctx->chip_class >= VI) {
+               unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
+               unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
 
-       if (sctx->b.chip_class >= VI) {
-               unsigned max_uncompressed_block_size = 2;
+               /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
+                  64 for APU because all of our APUs to date use DIMMs which have
+                  a request granularity size of 64B while all other chips have a
+                  32B request size */
+               if (!sctx->screen->info.has_dedicated_vram)
+                       min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
 
-               if (rtex->resource.b.b.nr_samples > 1) {
+               if (rtex->num_color_samples > 1) {
                        if (rtex->surface.bpe == 1)
-                               max_uncompressed_block_size = 0;
+                               max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
                        else if (rtex->surface.bpe == 2)
-                               max_uncompressed_block_size = 1;
+                               max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
                }
 
                surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
+                                      S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
                                       S_028C78_INDEPENDENT_64B_BLOCKS(1);
        }
 
        /* This must be set for fast clear to work without FMASK. */
-       if (!rtex->fmask.size && sctx->b.chip_class == SI) {
+       if (!rtex->surface.fmask_size && sctx->chip_class == SI) {
                unsigned bankh = util_logbase2(rtex->surface.u.legacy.bankh);
-               surf->cb_color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
+               color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
        }
 
-       if (sctx->b.chip_class >= GFX9) {
-               unsigned mip0_depth = util_max_layer(&rtex->resource.b.b, 0);
+       unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
+                             S_028C6C_SLICE_MAX(surf->base.u.tex.last_layer);
+
+       if (sctx->chip_class >= GFX9) {
+               unsigned mip0_depth = util_max_layer(&rtex->buffer.b.b, 0);
 
-               surf->cb_color_view |= S_028C6C_MIP_LEVEL(surf->base.u.tex.level);
-               surf->cb_color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
-                                        S_028C74_RESOURCE_TYPE(rtex->surface.u.gfx9.resource_type);
+               color_view |= S_028C6C_MIP_LEVEL(surf->base.u.tex.level);
+               color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
+                               S_028C74_RESOURCE_TYPE(rtex->surface.u.gfx9.resource_type);
                surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
                                         S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
-                                        S_028C68_MAX_MIP(rtex->resource.b.b.last_level);
+                                        S_028C68_MAX_MIP(rtex->buffer.b.b.last_level);
        }
 
+       surf->cb_color_view = color_view;
+       surf->cb_color_info = color_info;
+       surf->cb_color_attrib = color_attrib;
+
        /* Determine pixel shader export format */
        si_choose_spi_color_formats(surf, format, swap, ntype, rtex->is_depth);
 
@@ -2296,36 +2500,36 @@ static void si_init_depth_surface(struct si_context *sctx,
        uint32_t z_info, s_info;
 
        format = si_translate_dbformat(rtex->db_render_format);
-       stencil_format = rtex->surface.flags & RADEON_SURF_SBUFFER ?
+       stencil_format = rtex->surface.has_stencil ?
                                 V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
 
        assert(format != V_028040_Z_INVALID);
        if (format == V_028040_Z_INVALID)
-               R600_ERR("Invalid DB format: %d, disabling DB.\n", rtex->resource.b.b.format);
+               PRINT_ERR("Invalid DB format: %d, disabling DB.\n", rtex->buffer.b.b.format);
 
        surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
                              S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
        surf->db_htile_data_base = 0;
        surf->db_htile_surface = 0;
 
-       if (sctx->b.chip_class >= GFX9) {
+       if (sctx->chip_class >= GFX9) {
                assert(rtex->surface.u.gfx9.surf_offset == 0);
-               surf->db_depth_base = rtex->resource.gpu_address >> 8;
-               surf->db_stencil_base = (rtex->resource.gpu_address +
+               surf->db_depth_base = rtex->buffer.gpu_address >> 8;
+               surf->db_stencil_base = (rtex->buffer.gpu_address +
                                         rtex->surface.u.gfx9.stencil_offset) >> 8;
                z_info = S_028038_FORMAT(format) |
-                        S_028038_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples)) |
+                        S_028038_NUM_SAMPLES(util_logbase2(rtex->buffer.b.b.nr_samples)) |
                         S_028038_SW_MODE(rtex->surface.u.gfx9.surf.swizzle_mode) |
-                        S_028038_MAXMIP(rtex->resource.b.b.last_level);
+                        S_028038_MAXMIP(rtex->buffer.b.b.last_level);
                s_info = S_02803C_FORMAT(stencil_format) |
                         S_02803C_SW_MODE(rtex->surface.u.gfx9.stencil.swizzle_mode);
                surf->db_z_info2 = S_028068_EPITCH(rtex->surface.u.gfx9.surf.epitch);
                surf->db_stencil_info2 = S_02806C_EPITCH(rtex->surface.u.gfx9.stencil.epitch);
                surf->db_depth_view |= S_028008_MIPID(level);
-               surf->db_depth_size = S_02801C_X_MAX(rtex->resource.b.b.width0 - 1) |
-                                     S_02801C_Y_MAX(rtex->resource.b.b.height0 - 1);
+               surf->db_depth_size = S_02801C_X_MAX(rtex->buffer.b.b.width0 - 1) |
+                                     S_02801C_Y_MAX(rtex->buffer.b.b.height0 - 1);
 
-               if (r600_htile_enabled(rtex, level)) {
+               if (si_htile_enabled(rtex, level)) {
                        z_info |= S_028038_TILE_SURFACE_ENABLE(1) |
                                  S_028038_ALLOW_EXPCLEAR(1);
 
@@ -2333,7 +2537,7 @@ static void si_init_depth_surface(struct si_context *sctx,
                                unsigned max_zplanes = 4;
 
                                if (rtex->db_render_format == PIPE_FORMAT_Z16_UNORM &&
-                                   rtex->resource.b.b.nr_samples > 1)
+                                   rtex->buffer.b.b.nr_samples > 1)
                                        max_zplanes = 2;
 
                                z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1) |
@@ -2341,17 +2545,17 @@ static void si_init_depth_surface(struct si_context *sctx,
                                s_info |= S_02803C_ITERATE_FLUSH(1);
                        }
 
-                       if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
+                       if (rtex->surface.has_stencil) {
                                /* Stencil buffer workaround ported from the SI-CI-VI code.
                                 * See that for explanation.
                                 */
-                               s_info |= S_02803C_ALLOW_EXPCLEAR(rtex->resource.b.b.nr_samples <= 1);
+                               s_info |= S_02803C_ALLOW_EXPCLEAR(rtex->buffer.b.b.nr_samples <= 1);
                        } else {
                                /* Use all HTILE for depth if there's no stencil. */
                                s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
                        }
 
-                       surf->db_htile_data_base = (rtex->resource.gpu_address +
+                       surf->db_htile_data_base = (rtex->buffer.gpu_address +
                                                    rtex->htile_offset) >> 8;
                        surf->db_htile_surface = S_028ABC_FULL_CACHE(1) |
                                                 S_028ABC_PIPE_ALIGNED(rtex->surface.u.gfx9.htile.pipe_aligned) |
@@ -2363,18 +2567,18 @@ static void si_init_depth_surface(struct si_context *sctx,
 
                assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
 
-               surf->db_depth_base = (rtex->resource.gpu_address +
+               surf->db_depth_base = (rtex->buffer.gpu_address +
                                       rtex->surface.u.legacy.level[level].offset) >> 8;
-               surf->db_stencil_base = (rtex->resource.gpu_address +
+               surf->db_stencil_base = (rtex->buffer.gpu_address +
                                         rtex->surface.u.legacy.stencil_level[level].offset) >> 8;
 
                z_info = S_028040_FORMAT(format) |
-                        S_028040_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples));
+                        S_028040_NUM_SAMPLES(util_logbase2(rtex->buffer.b.b.nr_samples));
                s_info = S_028044_FORMAT(stencil_format);
                surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);
 
-               if (sctx->b.chip_class >= CIK) {
-                       struct radeon_info *info = &sctx->screen->b.info;
+               if (sctx->chip_class >= CIK) {
+                       struct radeon_info *info = &sctx->screen->info;
                        unsigned index = rtex->surface.u.legacy.tiling_index[level];
                        unsigned stencil_index = rtex->surface.u.legacy.stencil_tiling_index[level];
                        unsigned macro_index = rtex->surface.u.legacy.macro_tile_index;
@@ -2403,11 +2607,11 @@ static void si_init_depth_surface(struct si_context *sctx,
                surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x *
                                                                levelinfo->nblk_y) / 64 - 1);
 
-               if (r600_htile_enabled(rtex, level)) {
+               if (si_htile_enabled(rtex, level)) {
                        z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
                                  S_028040_ALLOW_EXPCLEAR(1);
 
-                       if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
+                       if (rtex->surface.has_stencil) {
                                /* Workaround: For a not yet understood reason, the
                                 * combination of MSAA, fast stencil clear and stencil
                                 * decompress messes with subsequent stencil buffer
@@ -2419,7 +2623,7 @@ static void si_init_depth_surface(struct si_context *sctx,
                                 * Check piglit's arb_texture_multisample-stencil-clear
                                 * test if you want to try changing this.
                                 */
-                               if (rtex->resource.b.b.nr_samples <= 1)
+                               if (rtex->buffer.b.b.nr_samples <= 1)
                                        s_info |= S_028044_ALLOW_EXPCLEAR(1);
                        } else if (!rtex->tc_compatible_htile) {
                                /* Use all of the htile_buffer for depth if there's no stencil.
@@ -2429,16 +2633,17 @@ static void si_init_depth_surface(struct si_context *sctx,
                                s_info |= S_028044_TILE_STENCIL_DISABLE(1);
                        }
 
-                       surf->db_htile_data_base = (rtex->resource.gpu_address +
+                       surf->db_htile_data_base = (rtex->buffer.gpu_address +
                                                    rtex->htile_offset) >> 8;
                        surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
 
                        if (rtex->tc_compatible_htile) {
                                surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
 
-                               if (rtex->resource.b.b.nr_samples <= 1)
+                               /* 0 = full compression. N = only compress up to N-1 Z planes. */
+                               if (rtex->buffer.b.b.nr_samples <= 1)
                                        z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
-                               else if (rtex->resource.b.b.nr_samples <= 4)
+                               else if (rtex->buffer.b.b.nr_samples <= 4)
                                        z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
                                else
                                        z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
@@ -2463,24 +2668,20 @@ void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
 
                rtex->dirty_level_mask |= 1 << surf->u.tex.level;
 
-               if (rtex->surface.flags & RADEON_SURF_SBUFFER)
+               if (rtex->surface.has_stencil)
                        rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
        }
-       if (sctx->framebuffer.compressed_cb_mask) {
-               struct pipe_surface *surf;
-               struct r600_texture *rtex;
-               unsigned mask = sctx->framebuffer.compressed_cb_mask;
 
-               do {
-                       unsigned i = u_bit_scan(&mask);
-                       surf = sctx->framebuffer.state.cbufs[i];
-                       rtex = (struct r600_texture*)surf->texture;
+       unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
+       while (compressed_cb_mask) {
+               unsigned i = u_bit_scan(&compressed_cb_mask);
+               struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
+               struct r600_texture *rtex = (struct r600_texture*)surf->texture;
 
-                       if (rtex->fmask.size)
-                               rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-                       if (rtex->dcc_gather_statistics)
-                               rtex->separate_dcc_dirty = true;
-               } while (mask);
+               if (rtex->surface.fmask_size)
+                       rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+               if (rtex->dcc_gather_statistics)
+                       rtex->separate_dcc_dirty = true;
        }
 }
 
@@ -2508,6 +2709,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        struct r600_texture *rtex;
        bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
        unsigned old_nr_samples = sctx->framebuffer.nr_samples;
+       unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
+       bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
+       bool old_has_stencil =
+               old_has_zsbuf &&
+               ((struct r600_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
        bool unbound = false;
        int i;
 
@@ -2519,7 +2725,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
                rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
                if (rtex->dcc_gather_statistics)
-                       vi_separate_dcc_stop_query(ctx, rtex);
+                       vi_separate_dcc_stop_query(sctx, rtex);
        }
 
        /* Disable DCC if the formats are incompatible. */
@@ -2544,8 +2750,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
                }
 
                if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
-                       if (!r600_texture_disable_dcc(&sctx->b, rtex))
-                               sctx->b.decompress_dcc(ctx, rtex);
+                       if (!si_texture_disable_dcc(sctx, rtex))
+                               si_decompress_dcc(sctx, rtex);
 
                surf->dcc_incompatible = false;
        }
@@ -2567,12 +2773,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         *
         * Only flush and wait for CB if there is actually a bound color buffer.
         */
-       if (sctx->framebuffer.nr_samples <= 1 &&
-           sctx->framebuffer.state.nr_cbufs)
+       if (sctx->framebuffer.uncompressed_cb_mask)
                si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                           sctx->framebuffer.CB_has_shader_readable_metadata);
 
-       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* u_blitter doesn't invoke depth decompression when it does multiple
         * blits in a row, but the only case when it matters for DB is when
@@ -2580,9 +2785,18 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         * individual generate_mipmap blits.
         * Note that lower mipmap levels aren't compressed.
         */
-       if (sctx->generate_mipmap_for_depth)
+       if (sctx->generate_mipmap_for_depth) {
                si_make_DB_shader_coherent(sctx, 1, false,
                                           sctx->framebuffer.DB_has_shader_readable_metadata);
+       } else if (sctx->chip_class == GFX9) {
+               /* It appears that DB metadata "leaks" in a sequence of:
+                *  - depth clear
+                *  - DCC decompress for shader image writes (with DB disabled)
+                *  - render with DEPTH_BEFORE_SHADER=1
+                * Flushing DB metadata works around the problem.
+                */
+               sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+       }
 
        /* Take the maximum of the old and new count. If the new count is lower,
         * dirtying is needed to disable the unbound colorbuffers.
@@ -2603,7 +2817,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        sctx->framebuffer.color_is_int10 = 0;
 
        sctx->framebuffer.compressed_cb_mask = 0;
+       sctx->framebuffer.uncompressed_cb_mask = 0;
        sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
+       sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
        sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
        sctx->framebuffer.any_dst_linear = false;
        sctx->framebuffer.CB_has_shader_readable_metadata = false;
@@ -2635,8 +2851,19 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
                if (surf->color_is_int10)
                        sctx->framebuffer.color_is_int10 |= 1 << i;
 
-               if (rtex->fmask.size) {
+               if (rtex->surface.fmask_size)
                        sctx->framebuffer.compressed_cb_mask |= 1 << i;
+               else
+                       sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
+
+               /* Don't update nr_color_samples for non-AA buffers.
+                * (e.g. destination of MSAA resolve)
+                */
+               if (rtex->buffer.b.b.nr_samples >= 2 &&
+                   rtex->num_color_samples < rtex->buffer.b.b.nr_samples) {
+                       sctx->framebuffer.nr_color_samples =
+                               MIN2(sctx->framebuffer.nr_color_samples,
+                                    rtex->num_color_samples);
                }
 
                if (rtex->surface.is_linear)
@@ -2645,68 +2872,80 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
                if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
                        sctx->framebuffer.CB_has_shader_readable_metadata = true;
 
-               r600_context_add_resource_size(ctx, surf->base.texture);
+               si_context_add_resource_size(sctx, surf->base.texture);
 
                p_atomic_inc(&rtex->framebuffers_bound);
 
                if (rtex->dcc_gather_statistics) {
                        /* Dirty tracking must be enabled for DCC usage analysis. */
                        sctx->framebuffer.compressed_cb_mask |= 1 << i;
-                       vi_separate_dcc_start_query(ctx, rtex);
+                       vi_separate_dcc_start_query(sctx, rtex);
                }
        }
 
+       struct r600_texture *zstex = NULL;
+
        if (state->zsbuf) {
                surf = (struct r600_surface*)state->zsbuf;
-               rtex = (struct r600_texture*)surf->base.texture;
+               zstex = (struct r600_texture*)surf->base.texture;
 
                if (!surf->depth_initialized) {
                        si_init_depth_surface(sctx, surf);
                }
 
-               if (vi_tc_compat_htile_enabled(rtex, surf->base.u.tex.level))
+               if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level))
                        sctx->framebuffer.DB_has_shader_readable_metadata = true;
 
-               r600_context_add_resource_size(ctx, surf->base.texture);
+               si_context_add_resource_size(sctx, surf->base.texture);
        }
 
+       si_update_ps_colorbuf0_slot(sctx);
        si_update_poly_offset_state(sctx);
-       si_mark_atom_dirty(sctx, &sctx->cb_render_state);
-       si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+
+       if (sctx->screen->dpbb_allowed)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
        if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
-               si_mark_atom_dirty(sctx, &sctx->msaa_config);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+
+       if (sctx->screen->has_out_of_order_rast &&
+           (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
+            !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
+            (zstex && zstex->surface.has_stencil != old_has_stencil)))
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 
        if (sctx->framebuffer.nr_samples != old_nr_samples) {
-               si_mark_atom_dirty(sctx, &sctx->msaa_config);
-               si_mark_atom_dirty(sctx, &sctx->db_render_state);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
                /* Set sample locations as fragment shader constants. */
                switch (sctx->framebuffer.nr_samples) {
                case 1:
-                       constbuf.user_buffer = sctx->b.sample_locations_1x;
+                       constbuf.user_buffer = sctx->sample_locations_1x;
                        break;
                case 2:
-                       constbuf.user_buffer = sctx->b.sample_locations_2x;
+                       constbuf.user_buffer = sctx->sample_locations_2x;
                        break;
                case 4:
-                       constbuf.user_buffer = sctx->b.sample_locations_4x;
+                       constbuf.user_buffer = sctx->sample_locations_4x;
                        break;
                case 8:
-                       constbuf.user_buffer = sctx->b.sample_locations_8x;
+                       constbuf.user_buffer = sctx->sample_locations_8x;
                        break;
                case 16:
-                       constbuf.user_buffer = sctx->b.sample_locations_16x;
+                       constbuf.user_buffer = sctx->sample_locations_16x;
                        break;
                default:
-                       R600_ERR("Requested an invalid number of samples %i.\n",
+                       PRINT_ERR("Requested an invalid number of samples %i.\n",
                                 sctx->framebuffer.nr_samples);
                        assert(0);
                }
                constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
                si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
 
-               si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
        }
 
        sctx->do_update_shaders = true;
@@ -2719,9 +2958,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        }
 }
 
-static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_framebuffer_state(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
        struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
        unsigned i, nr_cbufs = state->nr_cbufs;
        struct r600_texture *tex = NULL;
@@ -2730,7 +2969,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 
        /* Colorbuffers. */
        for (i = 0; i < nr_cbufs; i++) {
-               uint64_t cb_color_base, cb_color_fmask, cb_dcc_base;
+               uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
                unsigned cb_color_attrib;
 
                if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
@@ -2744,34 +2983,38 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                }
 
                tex = (struct r600_texture *)cb->base.texture;
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-                                     &tex->resource, RADEON_USAGE_READWRITE,
-                                     tex->resource.b.b.nr_samples > 1 ?
+               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+                                     &tex->buffer, RADEON_USAGE_READWRITE,
+                                     tex->buffer.b.b.nr_samples > 1 ?
                                              RADEON_PRIO_COLOR_BUFFER_MSAA :
                                              RADEON_PRIO_COLOR_BUFFER);
 
-               if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
-                       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+               if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
+                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                tex->cmask_buffer, RADEON_USAGE_READWRITE,
                                RADEON_PRIO_CMASK);
                }
 
                if (tex->dcc_separate_buffer)
-                       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+                       radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                                  tex->dcc_separate_buffer,
                                                  RADEON_USAGE_READWRITE,
                                                  RADEON_PRIO_DCC);
 
                /* Compute mutable surface parameters. */
-               cb_color_base = tex->resource.gpu_address >> 8;
+               cb_color_base = tex->buffer.gpu_address >> 8;
                cb_color_fmask = 0;
+               cb_color_cmask = tex->cmask.base_address_reg;
                cb_dcc_base = 0;
                cb_color_info = cb->cb_color_info | tex->cb_color_info;
                cb_color_attrib = cb->cb_color_attrib;
 
-               if (tex->fmask.size) {
-                       cb_color_fmask = (tex->resource.gpu_address + tex->fmask.offset) >> 8;
-                       cb_color_fmask |= tex->fmask.tile_swizzle;
+               if (cb->base.u.tex.level > 0)
+                       cb_color_info &= C_028C70_FAST_CLEAR;
+
+               if (tex->surface.fmask_size) {
+                       cb_color_fmask = (tex->buffer.gpu_address + tex->fmask_offset) >> 8;
+                       cb_color_fmask |= tex->surface.fmask_tile_swizzle;
                }
 
                /* Set up DCC. */
@@ -2784,12 +3027,12 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                        if (!is_msaa_resolve_dst)
                                cb_color_info |= S_028C70_DCC_ENABLE(1);
 
-                       cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
+                       cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
                                       tex->dcc_offset) >> 8;
                        cb_dcc_base |= tex->surface.tile_swizzle;
                }
 
-               if (sctx->b.chip_class >= GFX9) {
+               if (sctx->chip_class >= GFX9) {
                        struct gfx9_surf_meta_flags meta;
 
                        if (tex->dcc_offset)
@@ -2800,8 +3043,10 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                        /* Set mutable surface parameters. */
                        cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
                        cb_color_base |= tex->surface.tile_swizzle;
-                       if (!tex->fmask.size)
+                       if (!tex->surface.fmask_size)
                                cb_color_fmask = cb_color_base;
+                       if (cb->base.u.tex.level > 0)
+                               cb_color_cmask = cb_color_base;
                        cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
                                           S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
                                           S_028C74_RB_ALIGNED(meta.rb_aligned) |
@@ -2809,20 +3054,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 
                        radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
                        radeon_emit(cs, cb_color_base);         /* CB_COLOR0_BASE */
-                       radeon_emit(cs, cb_color_base >> 32);   /* CB_COLOR0_BASE_EXT */
+                       radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
                        radeon_emit(cs, cb->cb_color_attrib2);  /* CB_COLOR0_ATTRIB2 */
                        radeon_emit(cs, cb->cb_color_view);     /* CB_COLOR0_VIEW */
                        radeon_emit(cs, cb_color_info);         /* CB_COLOR0_INFO */
                        radeon_emit(cs, cb_color_attrib);       /* CB_COLOR0_ATTRIB */
                        radeon_emit(cs, cb->cb_dcc_control);    /* CB_COLOR0_DCC_CONTROL */
-                       radeon_emit(cs, tex->cmask.base_address_reg); /* CB_COLOR0_CMASK */
-                       radeon_emit(cs, tex->cmask.base_address_reg >> 32); /* CB_COLOR0_CMASK_BASE_EXT */
+                       radeon_emit(cs, cb_color_cmask);        /* CB_COLOR0_CMASK */
+                       radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
                        radeon_emit(cs, cb_color_fmask);        /* CB_COLOR0_FMASK */
-                       radeon_emit(cs, cb_color_fmask >> 32);  /* CB_COLOR0_FMASK_BASE_EXT */
+                       radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
                        radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
                        radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
                        radeon_emit(cs, cb_dcc_base);           /* CB_COLOR0_DCC_BASE */
-                       radeon_emit(cs, cb_dcc_base >> 32);     /* CB_COLOR0_DCC_BASE_EXT */
+                       radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
 
                        radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
                                               S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
@@ -2838,8 +3083,10 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                        if (level_info->mode == RADEON_SURF_MODE_2D)
                                cb_color_base |= tex->surface.tile_swizzle;
 
-                       if (!tex->fmask.size)
+                       if (!tex->surface.fmask_size)
                                cb_color_fmask = cb_color_base;
+                       if (cb->base.u.tex.level > 0)
+                               cb_color_cmask = cb_color_base;
                        if (cb_dcc_base)
                                cb_dcc_base += level_info->dcc_offset >> 8;
 
@@ -2852,21 +3099,21 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                        cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
                        cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
 
-                       if (tex->fmask.size) {
-                               if (sctx->b.chip_class >= CIK)
-                                       cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->fmask.pitch_in_pixels / 8 - 1);
-                               cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->fmask.tile_mode_index);
-                               cb_color_fmask_slice = S_028C88_TILE_MAX(tex->fmask.slice_tile_max);
+                       if (tex->surface.fmask_size) {
+                               if (sctx->chip_class >= CIK)
+                                       cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
+                               cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
+                               cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
                        } else {
                                /* This must be set for fast clear to work without FMASK. */
-                               if (sctx->b.chip_class >= CIK)
+                               if (sctx->chip_class >= CIK)
                                        cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
                                cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
                                cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
                        }
 
                        radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
-                                                  sctx->b.chip_class >= VI ? 14 : 13);
+                                                  sctx->chip_class >= VI ? 14 : 13);
                        radeon_emit(cs, cb_color_base);         /* CB_COLOR0_BASE */
                        radeon_emit(cs, cb_color_pitch);        /* CB_COLOR0_PITCH */
                        radeon_emit(cs, cb_color_slice);        /* CB_COLOR0_SLICE */
@@ -2874,14 +3121,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                        radeon_emit(cs, cb_color_info);         /* CB_COLOR0_INFO */
                        radeon_emit(cs, cb_color_attrib);       /* CB_COLOR0_ATTRIB */
                        radeon_emit(cs, cb->cb_dcc_control);    /* CB_COLOR0_DCC_CONTROL */
-                       radeon_emit(cs, tex->cmask.base_address_reg);   /* CB_COLOR0_CMASK */
+                       radeon_emit(cs, cb_color_cmask);        /* CB_COLOR0_CMASK */
                        radeon_emit(cs, tex->cmask.slice_tile_max);     /* CB_COLOR0_CMASK_SLICE */
                        radeon_emit(cs, cb_color_fmask);                /* CB_COLOR0_FMASK */
                        radeon_emit(cs, cb_color_fmask_slice);          /* CB_COLOR0_FMASK_SLICE */
                        radeon_emit(cs, tex->color_clear_value[0]);     /* CB_COLOR0_CLEAR_WORD0 */
                        radeon_emit(cs, tex->color_clear_value[1]);     /* CB_COLOR0_CLEAR_WORD1 */
 
-                       if (sctx->b.chip_class >= VI) /* R_028C94_CB_COLOR0_DCC_BASE */
+                       if (sctx->chip_class >= VI) /* R_028C94_CB_COLOR0_DCC_BASE */
                                radeon_emit(cs, cb_dcc_base);
                }
        }
@@ -2894,16 +3141,16 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
                struct r600_texture *rtex = (struct r600_texture*)zb->base.texture;
 
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-                                     &rtex->resource, RADEON_USAGE_READWRITE,
+               radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
+                                     &rtex->buffer, RADEON_USAGE_READWRITE,
                                      zb->base.texture->nr_samples > 1 ?
                                              RADEON_PRIO_DEPTH_BUFFER_MSAA :
                                              RADEON_PRIO_DEPTH_BUFFER);
 
-               if (sctx->b.chip_class >= GFX9) {
+               if (sctx->chip_class >= GFX9) {
                        radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
                        radeon_emit(cs, zb->db_htile_data_base);        /* DB_HTILE_DATA_BASE */
-                       radeon_emit(cs, zb->db_htile_data_base >> 32);  /* DB_HTILE_DATA_BASE_HI */
+                       radeon_emit(cs, S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
                        radeon_emit(cs, zb->db_depth_size);             /* DB_DEPTH_SIZE */
 
                        radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
@@ -2911,13 +3158,13 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                                    S_028038_ZRANGE_PRECISION(rtex->depth_clear_value != 0));
                        radeon_emit(cs, zb->db_stencil_info);           /* DB_STENCIL_INFO */
                        radeon_emit(cs, zb->db_depth_base);             /* DB_Z_READ_BASE */
-                       radeon_emit(cs, zb->db_depth_base >> 32);       /* DB_Z_READ_BASE_HI */
+                       radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
                        radeon_emit(cs, zb->db_stencil_base);           /* DB_STENCIL_READ_BASE */
-                       radeon_emit(cs, zb->db_stencil_base >> 32);     /* DB_STENCIL_READ_BASE_HI */
+                       radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
                        radeon_emit(cs, zb->db_depth_base);             /* DB_Z_WRITE_BASE */
-                       radeon_emit(cs, zb->db_depth_base >> 32);       /* DB_Z_WRITE_BASE_HI */
+                       radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
                        radeon_emit(cs, zb->db_stencil_base);           /* DB_STENCIL_WRITE_BASE */
-                       radeon_emit(cs, zb->db_stencil_base >> 32);     /* DB_STENCIL_WRITE_BASE_HI */
+                       radeon_emit(cs, S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
 
                        radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
                        radeon_emit(cs, zb->db_z_info2);        /* DB_Z_INFO2 */
@@ -2945,7 +3192,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
                radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
                radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
        } else if (sctx->framebuffer.dirty_zsbuf) {
-               if (sctx->b.chip_class >= GFX9)
+               if (sctx->chip_class >= GFX9)
                        radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
                else
                        radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
@@ -2968,10 +3215,9 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
        sctx->framebuffer.dirty_zsbuf = false;
 }
 
-static void si_emit_msaa_sample_locs(struct si_context *sctx,
-                                    struct r600_atom *atom)
+static void si_emit_msaa_sample_locs(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
        unsigned nr_samples = sctx->framebuffer.nr_samples;
        bool has_msaa_sample_loc_bug = sctx->screen->has_msaa_sample_loc_bug;
 
@@ -2987,17 +3233,17 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
        if (has_msaa_sample_loc_bug)
                nr_samples = MAX2(nr_samples, 1);
 
-       if (nr_samples != sctx->msaa_sample_locs.nr_samples) {
-               sctx->msaa_sample_locs.nr_samples = nr_samples;
-               cayman_emit_msaa_sample_locs(cs, nr_samples);
+       if (nr_samples != sctx->sample_locs_num_samples) {
+               sctx->sample_locs_num_samples = nr_samples;
+               si_emit_sample_locations(cs, nr_samples);
        }
 
-       if (sctx->b.family >= CHIP_POLARIS10) {
+       if (sctx->family >= CHIP_POLARIS10) {
                struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
                unsigned small_prim_filter_cntl =
                        S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
                        /* line bug */
-                       S_028830_LINE_FILTER_DISABLE(sctx->b.family <= CHIP_POLARIS12);
+                       S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);
 
                /* The alternative of setting sample locations to 0 would
                 * require a DB flush to avoid Z errors, see
@@ -3005,24 +3251,96 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
                 */
                if (has_msaa_sample_loc_bug &&
                    sctx->framebuffer.nr_samples > 1 &&
-                   rs && !rs->multisample_enable)
+                   !rs->multisample_enable)
                        small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
 
-               radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
-                                      small_prim_filter_cntl);
+               radeon_opt_set_context_reg(sctx,
+                                          R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
+                                          SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
+                                          small_prim_filter_cntl);
        }
 }
 
-static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
+static bool si_out_of_order_rasterization(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-       unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes;
+       struct si_state_blend *blend = sctx->queued.named.blend;
+       struct si_state_dsa *dsa = sctx->queued.named.dsa;
+
+       if (!sctx->screen->has_out_of_order_rast)
+               return false;
+
+       unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
+
+       if (blend) {
+               colormask &= blend->cb_target_enabled_4bit;
+       } else {
+               colormask = 0;
+       }
+
+       /* Conservative: No logic op. */
+       if (colormask && blend->logicop_enable)
+               return false;
+
+       struct si_dsa_order_invariance dsa_order_invariant = {
+               .zs = true, .pass_set = true, .pass_last = false
+       };
+
+       if (sctx->framebuffer.state.zsbuf) {
+               struct r600_texture *zstex =
+                       (struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
+               bool has_stencil = zstex->surface.has_stencil;
+               dsa_order_invariant = dsa->order_invariance[has_stencil];
+               if (!dsa_order_invariant.zs)
+                       return false;
+
+               /* The set of PS invocations is always order invariant,
+                * except when early Z/S tests are requested. */
+               if (sctx->ps_shader.cso &&
+                   sctx->ps_shader.cso->info.writes_memory &&
+                   sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
+                   !dsa_order_invariant.pass_set)
+                       return false;
+
+               if (sctx->num_perfect_occlusion_queries != 0 &&
+                   !dsa_order_invariant.pass_set)
+                       return false;
+       }
+
+       if (!colormask)
+               return true;
+
+       unsigned blendmask = colormask & blend->blend_enable_4bit;
+
+       if (blendmask) {
+               /* Only commutative blending. */
+               if (blendmask & ~blend->commutative_4bit)
+                       return false;
+
+               if (!dsa_order_invariant.pass_set)
+                       return false;
+       }
+
+       if (colormask & ~blendmask) {
+               if (!dsa_order_invariant.pass_last)
+                       return false;
+       }
+
+       return true;
+}
+
+static void si_emit_msaa_config(struct si_context *sctx)
+{
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+       unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
        /* 33% faster rendering to linear color buffers */
        bool dst_is_linear = sctx->framebuffer.any_dst_linear;
+       bool out_of_order_rast = si_out_of_order_rasterization(sctx);
        unsigned sc_mode_cntl_1 =
                S_028A4C_WALK_SIZE(dst_is_linear) |
                S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
                S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
+               S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
+               S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
                /* always 1: */
                S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
                S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
@@ -3030,11 +3348,119 @@ static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
                S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
                S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
                S_028A4C_FORCE_EOV_REZ_ENABLE(1);
+       unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
+                          S_028804_INCOHERENT_EQAA_READS(1) |
+                          S_028804_INTERPOLATE_COMP_Z(1) |
+                          S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
+       unsigned coverage_samples, color_samples, z_samples;
+
+       /* S: Coverage samples (up to 16x):
+        * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
+        * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
+        *
+        * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
+        * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
+        * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
+        * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
+        * # from the closest defined sample if Z is uncompressed (same quality as the number of
+        * # Z samples).
+        *
+        * F: Color samples (up to 8x, must be <= coverage samples):
+        * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
+        * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
+        *
+        * Can be anything between coverage and color samples:
+        * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
+        * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
+        * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
+        * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
+        * # All are currently set the same as coverage samples.
+        *
+        * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
+        * flag for undefined color samples. A shader-based resolve must handle unknowns
+        * or mask them out with AND. Unknowns can also be guessed from neighbors via
+        * an edge-detect shader-based resolve, which is required to make "color samples = 1"
+        * useful. The CB resolve always drops unknowns.
+        *
+        * Sensible AA configurations:
+        *   EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
+        *   EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
+        *   EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
+        *   EQAA  8s 8z 8f = 8x MSAA
+        *   EQAA  8s 8z 4f - might look the same as 8x MSAA
+        *   EQAA  8s 8z 2f - might look the same as 8x MSAA with low-density geometry
+        *   EQAA  8s 4z 4f - might look the same as 8x MSAA if Z is compressed
+        *   EQAA  8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
+        *   EQAA  4s 4z 4f = 4x MSAA
+        *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
+        *   EQAA  2s 2z 2f = 2x MSAA
+        */
+       if (sctx->framebuffer.nr_samples > 1) {
+               coverage_samples = sctx->framebuffer.nr_samples;
+               color_samples = sctx->framebuffer.nr_color_samples;
 
-       cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples,
-                               sctx->ps_iter_samples,
-                               sctx->smoothing_enabled ? SI_NUM_SMOOTH_AA_SAMPLES : 0,
-                               sc_mode_cntl_1);
+               if (sctx->framebuffer.state.zsbuf) {
+                       z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
+                       z_samples = MAX2(1, z_samples);
+               } else {
+                       z_samples = coverage_samples;
+               }
+       } else if (sctx->smoothing_enabled) {
+               coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+       } else {
+               coverage_samples = color_samples = z_samples = 1;
+       }
+
+       /* Required by OpenGL line rasterization.
+        *
+        * TODO: We should also enable perpendicular endcaps for AA lines,
+        *       but that requires implementing line stippling in the pixel
+        *       shader. SC can only do line stippling with axis-aligned
+        *       endcaps.
+        */
+       unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
+       unsigned sc_aa_config = 0;
+
+       if (coverage_samples > 1) {
+               /* distance from the pixel center, indexed by log2(nr_samples) */
+               static unsigned max_dist[] = {
+                       0, /* unused */
+                       4, /* 2x MSAA */
+                       6, /* 4x MSAA */
+                       7, /* 8x MSAA */
+                       8, /* 16x MSAA */
+               };
+               unsigned log_samples = util_logbase2(coverage_samples);
+               unsigned log_z_samples = util_logbase2(z_samples);
+               unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
+               unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
+
+               sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
+               sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
+                              S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
+                              S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
+
+               if (sctx->framebuffer.nr_samples > 1) {
+                       db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
+                                  S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
+                                  S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
+                                  S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
+                       sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
+               } else if (sctx->smoothing_enabled) {
+                       db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
+               }
+       }
+
+       /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
+       radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL,
+                                   SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
+                                   sc_aa_config);
+       /* R_028804_DB_EQAA */
+       radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
+                                  db_eqaa);
+       /* R_028A4C_PA_SC_MODE_CNTL_1 */
+       radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
+                                  SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
 
        /* GFX9: Flush DFSM when the AA mode changes. */
        if (sctx->screen->dfsm_allowed) {
@@ -3043,18 +3469,28 @@ static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
        }
 }
 
+void si_update_ps_iter_samples(struct si_context *sctx)
+{
+       if (sctx->framebuffer.nr_samples > 1)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+       if (sctx->screen->dpbb_allowed)
+               si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+}
+
 static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
+       /* The hardware can only do sample shading with 2^n samples. */
+       min_samples = util_next_power_of_two(min_samples);
+
        if (sctx->ps_iter_samples == min_samples)
                return;
 
        sctx->ps_iter_samples = min_samples;
        sctx->do_update_shaders = true;
 
-       if (sctx->framebuffer.nr_samples > 1)
-               si_mark_atom_dirty(sctx, &sctx->msaa_config);
+       si_update_ps_iter_samples(sctx);
 }
 
 /*
@@ -3080,8 +3516,8 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
        desc = util_format_description(format);
        first_non_void = util_format_get_first_non_void_channel(format);
        stride = desc->block.bits / 8;
-       num_format = si_translate_buffer_numformat(&screen->b.b, desc, first_non_void);
-       data_format = si_translate_buffer_dataformat(&screen->b.b, desc, first_non_void);
+       num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
+       data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
 
        num_records = size / stride;
        num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
@@ -3109,7 +3545,7 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
         * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
         * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
         */
-       if (screen->b.chip_class >= GFX9)
+       if (screen->info.chip_class >= GFX9)
                /* When vindex == 0, LLVM sets IDXEN = 0, thus changing units
                 * from STRIDE to bytes. This works around it by setting
                 * NUM_RECORDS to at least the size of one element, so that
@@ -3119,7 +3555,7 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
                 *       IDXEN is enforced?
                 */
                num_records = num_records ? MAX2(num_records, stride) : 0;
-       else if (screen->b.chip_class == VI)
+       else if (screen->info.chip_class == VI)
                num_records *= stride;
 
        state[4] = 0;
@@ -3178,15 +3614,18 @@ si_make_texture_descriptor(struct si_screen *screen,
                           uint32_t *state,
                           uint32_t *fmask_state)
 {
-       struct pipe_resource *res = &tex->resource.b.b;
+       struct pipe_resource *res = &tex->buffer.b.b;
        const struct util_format_description *desc;
        unsigned char swizzle[4];
        int first_non_void;
-       unsigned num_format, data_format, type;
+       unsigned num_format, data_format, type, num_samples;
        uint64_t va;
 
        desc = util_format_description(pipe_format);
 
+       num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ?
+                       MAX2(1, res->nr_samples) : tex->num_color_samples;
+
        if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
                const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
                const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
@@ -3204,7 +3643,10 @@ si_make_texture_descriptor(struct si_screen *screen,
                         * fix texture gathers. This affects at least
                         * GL45-CTS.texture_cube_map_array.sampling on VI.
                         */
-                       util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+                       if (screen->info.chip_class <= VI)
+                               util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
+                       else
+                               util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
                        break;
                default:
                        util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
@@ -3281,13 +3723,13 @@ si_make_texture_descriptor(struct si_screen *screen,
                }
        }
 
-       data_format = si_translate_texformat(&screen->b.b, pipe_format, desc, first_non_void);
+       data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
        if (data_format == ~0) {
                data_format = 0;
        }
 
        /* S8 with Z32 HTILE needs a special format. */
-       if (screen->b.chip_class >= GFX9 &&
+       if (screen->info.chip_class >= GFX9 &&
            pipe_format == PIPE_FORMAT_S8_UINT &&
            tex->tc_compatible_htile)
                data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
@@ -3295,7 +3737,7 @@ si_make_texture_descriptor(struct si_screen *screen,
        if (!sampler &&
            (res->target == PIPE_TEXTURE_CUBE ||
             res->target == PIPE_TEXTURE_CUBE_ARRAY ||
-            (screen->b.chip_class <= VI &&
+            (screen->info.chip_class <= VI &&
              res->target == PIPE_TEXTURE_3D))) {
                /* For the purpose of shader images, treat cube maps and 3D
                 * textures as 2D arrays. For 3D textures, the address
@@ -3306,7 +3748,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 
                assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
        } else {
-               type = si_tex_dim(screen, tex, target, res->nr_samples);
+               type = si_tex_dim(screen, tex, target, num_samples);
        }
 
        if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
@@ -3329,10 +3771,9 @@ si_make_texture_descriptor(struct si_screen *screen,
                    S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
                    S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
                    S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
-                   S_008F1C_BASE_LEVEL(res->nr_samples > 1 ?
-                                       0 : first_level) |
-                   S_008F1C_LAST_LEVEL(res->nr_samples > 1 ?
-                                       util_logbase2(res->nr_samples) :
+                   S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
+                   S_008F1C_LAST_LEVEL(num_samples > 1 ?
+                                       util_logbase2(num_samples) :
                                        last_level) |
                    S_008F1C_TYPE(type));
        state[4] = 0;
@@ -3340,7 +3781,7 @@ si_make_texture_descriptor(struct si_screen *screen,
        state[6] = 0;
        state[7] = 0;
 
-       if (screen->b.chip_class >= GFX9) {
+       if (screen->info.chip_class >= GFX9) {
                unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
 
                /* Depth is the the last accessible layer on Gfx9.
@@ -3352,9 +3793,9 @@ si_make_texture_descriptor(struct si_screen *screen,
                        state[4] |= S_008F20_DEPTH(last_layer);
 
                state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
-               state[5] |= S_008F24_MAX_MIP(res->nr_samples > 1 ?
-                                            util_logbase2(res->nr_samples) :
-                                            tex->resource.b.b.last_level);
+               state[5] |= S_008F24_MAX_MIP(num_samples > 1 ?
+                                            util_logbase2(num_samples) :
+                                            tex->buffer.b.b.last_level);
        } else {
                state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
                state[4] |= S_008F20_DEPTH(depth - 1);
@@ -3362,14 +3803,12 @@ si_make_texture_descriptor(struct si_screen *screen,
        }
 
        if (tex->dcc_offset) {
-               unsigned swap = r600_translate_colorswap(pipe_format, false);
-
-               state[6] = S_008F28_ALPHA_IS_ON_MSB(swap <= 1);
+               state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(pipe_format));
        } else {
                /* The last dword is unused by hw. The shader uses it to clear
                 * bits in the first dword of sampler state.
                 */
-               if (screen->b.chip_class <= CIK && res->nr_samples <= 1) {
+               if (screen->info.chip_class <= CIK && res->nr_samples <= 1) {
                        if (first_level == last_level)
                                state[7] = C_008F30_MAX_ANISO_RATIO;
                        else
@@ -3378,44 +3817,106 @@ si_make_texture_descriptor(struct si_screen *screen,
        }
 
        /* Initialize the sampler view for FMASK. */
-       if (tex->fmask.size) {
+       if (tex->surface.fmask_size) {
                uint32_t data_format, num_format;
 
-               va = tex->resource.gpu_address + tex->fmask.offset;
+               va = tex->buffer.gpu_address + tex->fmask_offset;
 
-               if (screen->b.chip_class >= GFX9) {
+#define FMASK(s,f) (((unsigned)(s) * 16) + (f))
+               if (screen->info.chip_class >= GFX9) {
                        data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
-                       switch (res->nr_samples) {
-                       case 2:
+                       switch (FMASK(res->nr_samples, tex->num_color_samples)) {
+                       case FMASK(2,1):
+                               num_format = V_008F14_IMG_FMASK_8_2_1;
+                               break;
+                       case FMASK(2,2):
                                num_format = V_008F14_IMG_FMASK_8_2_2;
                                break;
-                       case 4:
+                       case FMASK(4,1):
+                               num_format = V_008F14_IMG_FMASK_8_4_1;
+                               break;
+                       case FMASK(4,2):
+                               num_format = V_008F14_IMG_FMASK_8_4_2;
+                               break;
+                       case FMASK(4,4):
                                num_format = V_008F14_IMG_FMASK_8_4_4;
                                break;
-                       case 8:
+                       case FMASK(8,1):
+                               num_format = V_008F14_IMG_FMASK_8_8_1;
+                               break;
+                       case FMASK(8,2):
+                               num_format = V_008F14_IMG_FMASK_16_8_2;
+                               break;
+                       case FMASK(8,4):
+                               num_format = V_008F14_IMG_FMASK_32_8_4;
+                               break;
+                       case FMASK(8,8):
                                num_format = V_008F14_IMG_FMASK_32_8_8;
                                break;
+                       case FMASK(16,1):
+                               num_format = V_008F14_IMG_FMASK_16_16_1;
+                               break;
+                       case FMASK(16,2):
+                               num_format = V_008F14_IMG_FMASK_32_16_2;
+                               break;
+                       case FMASK(16,4):
+                               num_format = V_008F14_IMG_FMASK_64_16_4;
+                               break;
+                       case FMASK(16,8):
+                               num_format = V_008F14_IMG_FMASK_64_16_8;
+                               break;
                        default:
                                unreachable("invalid nr_samples");
                        }
                } else {
-                       switch (res->nr_samples) {
-                       case 2:
+                       switch (FMASK(res->nr_samples, tex->num_color_samples)) {
+                       case FMASK(2,1):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
+                               break;
+                       case FMASK(2,2):
                                data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
                                break;
-                       case 4:
+                       case FMASK(4,1):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
+                               break;
+                       case FMASK(4,2):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
+                               break;
+                       case FMASK(4,4):
                                data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
                                break;
-                       case 8:
+                       case FMASK(8,1):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
+                               break;
+                       case FMASK(8,2):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
+                               break;
+                       case FMASK(8,4):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
+                               break;
+                       case FMASK(8,8):
                                data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
                                break;
+                       case FMASK(16,1):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
+                               break;
+                       case FMASK(16,2):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
+                               break;
+                       case FMASK(16,4):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
+                               break;
+                       case FMASK(16,8):
+                               data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
+                               break;
                        default:
                                unreachable("invalid nr_samples");
                        }
                        num_format = V_008F14_IMG_NUM_FORMAT_UINT;
                }
+#undef FMASK
 
-               fmask_state[0] = (va >> 8) | tex->fmask.tile_swizzle;
+               fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
                fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
                                 S_008F14_DATA_FORMAT_GFX6(data_format) |
                                 S_008F14_NUM_FORMAT_GFX6(num_format);
@@ -3431,16 +3932,16 @@ si_make_texture_descriptor(struct si_screen *screen,
                fmask_state[6] = 0;
                fmask_state[7] = 0;
 
-               if (screen->b.chip_class >= GFX9) {
+               if (screen->info.chip_class >= GFX9) {
                        fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
                        fmask_state[4] |= S_008F20_DEPTH(last_layer) |
                                          S_008F20_PITCH_GFX9(tex->surface.u.gfx9.fmask.epitch);
                        fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
                                          S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
                } else {
-                       fmask_state[3] |= S_008F1C_TILING_INDEX(tex->fmask.tile_mode_index);
+                       fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
                        fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
-                                         S_008F20_PITCH_GFX6(tex->fmask.pitch_in_pixels - 1);
+                                         S_008F20_PITCH_GFX6(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
                        fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
                }
        }
@@ -3494,7 +3995,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
        /* Buffer resource. */
        if (texture->target == PIPE_BUFFER) {
                si_make_buffer_descriptor(sctx->screen,
-                                         (struct r600_resource *)texture,
+                                         r600_resource(texture),
                                          state->format,
                                          state->u.buf.offset,
                                          state->u.buf.size,
@@ -3514,7 +4015,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
        height = height0;
        depth = texture->depth0;
 
-       if (sctx->b.chip_class <= VI && force_level) {
+       if (sctx->chip_class <= VI && force_level) {
                assert(force_level == first_level &&
                       force_level == last_level);
                base_level = force_level;
@@ -3536,9 +4037,9 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
        pipe_format = state->format;
 
        /* Depth/stencil texturing sometimes needs separate texture. */
-       if (tmp->is_depth && !r600_can_sample_zs(tmp, view->is_stencil_sampler)) {
+       if (tmp->is_depth && !si_can_sample_zs(tmp, view->is_stencil_sampler)) {
                if (!tmp->flushed_depth_texture &&
-                   !r600_init_flushed_depth_texture(ctx, texture, NULL)) {
+                   !si_init_flushed_depth_texture(ctx, texture, NULL)) {
                        pipe_resource_reference(&view->base.texture, NULL);
                        FREE(view);
                        return NULL;
@@ -3549,8 +4050,8 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
                /* Override format for the case where the flushed texture
                 * contains only Z or only S.
                 */
-               if (tmp->flushed_depth_texture->resource.b.b.format != tmp->resource.b.b.format)
-                       pipe_format = tmp->flushed_depth_texture->resource.b.b.format;
+               if (tmp->flushed_depth_texture->buffer.b.b.format != tmp->buffer.b.b.format)
+                       pipe_format = tmp->flushed_depth_texture->buffer.b.b.format;
 
                tmp = tmp->flushed_depth_texture;
        }
@@ -3594,6 +4095,12 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
                                   width, height, depth,
                                   view->state, view->fmask_state);
 
+       unsigned num_format = G_008F14_NUM_FORMAT_GFX6(view->state[1]);
+       view->is_integer =
+               num_format == V_008F14_IMG_NUM_FORMAT_USCALED ||
+               num_format == V_008F14_IMG_NUM_FORMAT_SSCALED ||
+               num_format == V_008F14_IMG_NUM_FORMAT_UINT ||
+               num_format == V_008F14_IMG_NUM_FORMAT_SINT;
        view->base_level_info = &surflevel[base_level];
        view->base_level = base_level;
        view->block_width = util_format_get_blockwidth(pipe_format);
@@ -3628,82 +4135,111 @@ static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
                 wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
 }
 
-static bool sampler_state_needs_border_color(const struct pipe_sampler_state *state)
+static uint32_t si_translate_border_color(struct si_context *sctx,
+                                         const struct pipe_sampler_state *state,
+                                         const union pipe_color_union *color,
+                                         bool is_integer)
 {
        bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
                             state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
 
-       return (state->border_color.ui[0] || state->border_color.ui[1] ||
-               state->border_color.ui[2] || state->border_color.ui[3]) &&
-              (wrap_mode_uses_border_color(state->wrap_s, linear_filter) ||
-               wrap_mode_uses_border_color(state->wrap_t, linear_filter) ||
-               wrap_mode_uses_border_color(state->wrap_r, linear_filter));
+       if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
+           !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
+           !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
+               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+
+#define simple_border_types(elt) \
+do { \
+       if (color->elt[0] == 0 && color->elt[1] == 0 &&                         \
+           color->elt[2] == 0 && color->elt[3] == 0)                           \
+               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \
+       if (color->elt[0] == 0 && color->elt[1] == 0 &&                         \
+           color->elt[2] == 0 && color->elt[3] == 1)                           \
+               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \
+       if (color->elt[0] == 1 && color->elt[1] == 1 &&                         \
+           color->elt[2] == 1 && color->elt[3] == 1)                           \
+               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \
+} while (false)
+
+       if (is_integer)
+               simple_border_types(ui);
+       else
+               simple_border_types(f);
+
+#undef simple_border_types
+
+       int i;
+
+       /* Check if the border has been uploaded already. */
+       for (i = 0; i < sctx->border_color_count; i++)
+               if (memcmp(&sctx->border_color_table[i], color,
+                          sizeof(*color)) == 0)
+                       break;
+
+       if (i >= SI_MAX_BORDER_COLORS) {
+               /* Getting 4096 unique border colors is very unlikely. */
+               fprintf(stderr, "radeonsi: The border color table is full. "
+                       "Any new border colors will be just black. "
+                       "Please file a bug.\n");
+               return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
+       }
+
+       if (i == sctx->border_color_count) {
+               /* Upload a new border color. */
+               memcpy(&sctx->border_color_table[i], color,
+                      sizeof(*color));
+               util_memcpy_cpu_to_le32(&sctx->border_color_map[i],
+                                       color, sizeof(*color));
+               sctx->border_color_count++;
+       }
+
+       return S_008F3C_BORDER_COLOR_PTR(i) |
+              S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
+}
+
+static inline int S_FIXED(float value, unsigned frac_bits)
+{
+       return value * (1 << frac_bits);
+}
+
+static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso)
+{
+       if (filter == PIPE_TEX_FILTER_LINEAR)
+               return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
+                                    : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
+       else
+               return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
+                                    : V_008F38_SQ_TEX_XY_FILTER_POINT;
+}
+
+static inline unsigned si_tex_aniso_filter(unsigned filter)
+{
+       if (filter < 2)
+               return 0;
+       if (filter < 4)
+               return 1;
+       if (filter < 8)
+               return 2;
+       if (filter < 16)
+               return 3;
+       return 4;
 }
 
 static void *si_create_sampler_state(struct pipe_context *ctx,
                                     const struct pipe_sampler_state *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
-       struct r600_common_screen *rscreen = sctx->b.screen;
+       struct si_screen *sscreen = sctx->screen;
        struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
-       unsigned border_color_type, border_color_index = 0;
-       unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso
+       unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso
                                                       : state->max_anisotropy;
-       unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso);
+       unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
+       union pipe_color_union clamped_border_color;
 
        if (!rstate) {
                return NULL;
        }
 
-       if (!sampler_state_needs_border_color(state))
-               border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK;
-       else if (state->border_color.f[0] == 0 &&
-                state->border_color.f[1] == 0 &&
-                state->border_color.f[2] == 0 &&
-                state->border_color.f[3] == 0)
-               border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK;
-       else if (state->border_color.f[0] == 0 &&
-                state->border_color.f[1] == 0 &&
-                state->border_color.f[2] == 0 &&
-                state->border_color.f[3] == 1)
-               border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK;
-       else if (state->border_color.f[0] == 1 &&
-                state->border_color.f[1] == 1 &&
-                state->border_color.f[2] == 1 &&
-                state->border_color.f[3] == 1)
-               border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE;
-       else {
-               int i;
-
-               border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER;
-
-               /* Check if the border has been uploaded already. */
-               for (i = 0; i < sctx->border_color_count; i++)
-                       if (memcmp(&sctx->border_color_table[i], &state->border_color,
-                                  sizeof(state->border_color)) == 0)
-                               break;
-
-               if (i >= SI_MAX_BORDER_COLORS) {
-                       /* Getting 4096 unique border colors is very unlikely. */
-                       fprintf(stderr, "radeonsi: The border color table is full. "
-                               "Any new border colors will be just black. "
-                               "Please file a bug.\n");
-                       border_color_type = V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK;
-               } else {
-                       if (i == sctx->border_color_count) {
-                               /* Upload a new border color. */
-                               memcpy(&sctx->border_color_table[i], &state->border_color,
-                                      sizeof(state->border_color));
-                               util_memcpy_cpu_to_le32(&sctx->border_color_map[i],
-                                                       &state->border_color,
-                                                       sizeof(state->border_color));
-                               sctx->border_color_count++;
-                       }
-
-                       border_color_index = i;
-               }
-       }
-
 #ifdef DEBUG
        rstate->magic = SI_SAMPLER_STATE_MAGIC;
 #endif
@@ -3716,20 +4252,40 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
                          S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) |
                          S_008F30_ANISO_BIAS(max_aniso_ratio) |
                          S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
-                         S_008F30_COMPAT_MODE(sctx->b.chip_class >= VI));
+                         S_008F30_COMPAT_MODE(sctx->chip_class >= VI));
        rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
                          S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
                          S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
        rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
-                         S_008F38_XY_MAG_FILTER(eg_tex_filter(state->mag_img_filter, max_aniso)) |
-                         S_008F38_XY_MIN_FILTER(eg_tex_filter(state->min_img_filter, max_aniso)) |
+                         S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
+                         S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
                          S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
-                         S_008F38_MIP_POINT_PRECLAMP(1) |
-                         S_008F38_DISABLE_LSB_CEIL(sctx->b.chip_class <= VI) |
+                         S_008F38_MIP_POINT_PRECLAMP(0) |
+                         S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= VI) |
                          S_008F38_FILTER_PREC_FIX(1) |
-                         S_008F38_ANISO_OVERRIDE(sctx->b.chip_class >= VI));
-       rstate->val[3] = S_008F3C_BORDER_COLOR_PTR(border_color_index) |
-                        S_008F3C_BORDER_COLOR_TYPE(border_color_type);
+                         S_008F38_ANISO_OVERRIDE(sctx->chip_class >= VI));
+       rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);
+
+       /* Create sampler resource for integer textures. */
+       memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
+       rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);
+
+       /* Create sampler resource for upgraded depth textures. */
+       memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
+
+       for (unsigned i = 0; i < 4; ++i) {
+               /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
+                * when the border color is 1.0. */
+               clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
+       }
+
+       if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0)
+               rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
+       else
+               rstate->upgraded_depth_val[3] =
+                       si_translate_border_color(sctx, state, &clamped_border_color, false) |
+                       S_008F3C_UPGRADED_DEPTH(1);
+
        return rstate;
 }
 
@@ -3737,17 +4293,17 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
-       if (sctx->sample_mask.sample_mask == (uint16_t)sample_mask)
+       if (sctx->sample_mask == (uint16_t)sample_mask)
                return;
 
-       sctx->sample_mask.sample_mask = sample_mask;
-       si_mark_atom_dirty(sctx, &sctx->sample_mask.atom);
+       sctx->sample_mask = sample_mask;
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
 }
 
-static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_sample_mask(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-       unsigned mask = sctx->sample_mask.sample_mask;
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+       unsigned mask = sctx->sample_mask;
 
        /* Needed for line and polygon smoothing as well as for the Polaris
         * small primitive filter. We expect the state tracker to take care of
@@ -3836,8 +4392,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                 * chips are VI and older except Stoney (GFX8.1).
                 */
                if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 &&
-                   sscreen->b.chip_class <= VI &&
-                   sscreen->b.family != CHIP_STONEY) {
+                   sscreen->info.chip_class <= VI &&
+                   sscreen->info.family != CHIP_STONEY) {
                        if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
                                v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM;
                        } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) {
@@ -3981,7 +4537,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
                        pipe_resource_reference(&dsti->buffer.resource, buf);
                        dsti->buffer_offset = src->buffer_offset;
                        dsti->stride = src->stride;
-                       r600_context_add_resource_size(ctx, buf);
+                       si_context_add_resource_size(sctx, buf);
                        if (buf)
                                r600_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
                }
@@ -4027,8 +4583,7 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
        si_update_fb_dirtiness_after_rendering(sctx);
 
        /* Multisample surfaces are flushed in si_decompress_textures. */
-       if (sctx->framebuffer.nr_samples <= 1 &&
-           sctx->framebuffer.state.nr_cbufs)
+       if (sctx->framebuffer.uncompressed_cb_mask)
                si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
                                           sctx->framebuffer.CB_has_shader_readable_metadata);
 }
@@ -4040,11 +4595,11 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                         SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
-               sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+               sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
                                 SI_CONTEXT_INV_VMEM_L1;
 
        if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
@@ -4056,33 +4611,32 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
                /* As far as I can tell, L1 contents are written back to L2
                 * automatically at end of shader, but the contents of other
                 * L1 caches might still be stale. */
-               sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+               sctx->flags |= SI_CONTEXT_INV_VMEM_L1;
        }
 
        if (flags & PIPE_BARRIER_INDEX_BUFFER) {
                /* Indices are read through TC L2 since VI.
                 * L1 isn't used.
                 */
-               if (sctx->screen->b.chip_class <= CIK)
-                       sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+               if (sctx->screen->info.chip_class <= CIK)
+                       sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
        }
 
        /* MSAA color, any depth and any stencil are flushed in
         * si_decompress_textures when needed.
         */
        if (flags & PIPE_BARRIER_FRAMEBUFFER &&
-           sctx->framebuffer.nr_samples <= 1 &&
-           sctx->framebuffer.state.nr_cbufs) {
-               sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+           sctx->framebuffer.uncompressed_cb_mask) {
+               sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
 
-               if (sctx->b.chip_class <= VI)
-                       sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+               if (sctx->chip_class <= VI)
+                       sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
        }
 
        /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-       if (sctx->screen->b.chip_class <= VI &&
+       if (sctx->screen->info.chip_class <= VI &&
            flags & PIPE_BARRIER_INDIRECT_BUFFER)
-               sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+               sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
@@ -4092,48 +4646,37 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
        memset(&blend, 0, sizeof(blend));
        blend.independent_blend_enable = true;
        blend.rt[0].colormask = 0xf;
-       return si_create_blend_state_mode(&sctx->b.b, &blend, mode);
-}
-
-static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
-                                bool include_draw_vbo)
-{
-       si_need_cs_space((struct si_context*)ctx);
+       return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }
 
 static void si_init_config(struct si_context *sctx);
 
 void si_init_state_functions(struct si_context *sctx)
 {
-       si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond);
-       si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin);
-       si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable);
-       si_init_external_atom(sctx, &sctx->b.scissors.atom, &sctx->atoms.s.scissors);
-       si_init_external_atom(sctx, &sctx->b.viewports.atom, &sctx->atoms.s.viewports);
-
-       si_init_atom(sctx, &sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state);
-       si_init_atom(sctx, &sctx->msaa_sample_locs.atom, &sctx->atoms.s.msaa_sample_locs, si_emit_msaa_sample_locs);
-       si_init_atom(sctx, &sctx->db_render_state, &sctx->atoms.s.db_render_state, si_emit_db_render_state);
-       si_init_atom(sctx, &sctx->msaa_config, &sctx->atoms.s.msaa_config, si_emit_msaa_config);
-       si_init_atom(sctx, &sctx->sample_mask.atom, &sctx->atoms.s.sample_mask, si_emit_sample_mask);
-       si_init_atom(sctx, &sctx->cb_render_state, &sctx->atoms.s.cb_render_state, si_emit_cb_render_state);
-       si_init_atom(sctx, &sctx->blend_color.atom, &sctx->atoms.s.blend_color, si_emit_blend_color);
-       si_init_atom(sctx, &sctx->clip_regs, &sctx->atoms.s.clip_regs, si_emit_clip_regs);
-       si_init_atom(sctx, &sctx->clip_state.atom, &sctx->atoms.s.clip_state, si_emit_clip_state);
-       si_init_atom(sctx, &sctx->stencil_ref.atom, &sctx->atoms.s.stencil_ref, si_emit_stencil_ref);
-
-       sctx->b.b.create_blend_state = si_create_blend_state;
-       sctx->b.b.bind_blend_state = si_bind_blend_state;
-       sctx->b.b.delete_blend_state = si_delete_blend_state;
-       sctx->b.b.set_blend_color = si_set_blend_color;
-
-       sctx->b.b.create_rasterizer_state = si_create_rs_state;
-       sctx->b.b.bind_rasterizer_state = si_bind_rs_state;
-       sctx->b.b.delete_rasterizer_state = si_delete_rs_state;
-
-       sctx->b.b.create_depth_stencil_alpha_state = si_create_dsa_state;
-       sctx->b.b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
-       sctx->b.b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
+       sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
+       sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
+       sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
+       sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
+       sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
+       sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
+       sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
+       sctx->atoms.s.blend_color.emit = si_emit_blend_color;
+       sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
+       sctx->atoms.s.clip_state.emit = si_emit_clip_state;
+       sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+
+       sctx->b.create_blend_state = si_create_blend_state;
+       sctx->b.bind_blend_state = si_bind_blend_state;
+       sctx->b.delete_blend_state = si_delete_blend_state;
+       sctx->b.set_blend_color = si_set_blend_color;
+
+       sctx->b.create_rasterizer_state = si_create_rs_state;
+       sctx->b.bind_rasterizer_state = si_bind_rs_state;
+       sctx->b.delete_rasterizer_state = si_delete_rs_state;
+
+       sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
+       sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
+       sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
 
        sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
        sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
@@ -4141,155 +4684,53 @@ void si_init_state_functions(struct si_context *sctx)
        sctx->custom_blend_eliminate_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
        sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
 
-       sctx->b.b.set_clip_state = si_set_clip_state;
-       sctx->b.b.set_stencil_ref = si_set_stencil_ref;
+       sctx->b.set_clip_state = si_set_clip_state;
+       sctx->b.set_stencil_ref = si_set_stencil_ref;
 
-       sctx->b.b.set_framebuffer_state = si_set_framebuffer_state;
-       sctx->b.b.get_sample_position = cayman_get_sample_position;
+       sctx->b.set_framebuffer_state = si_set_framebuffer_state;
 
-       sctx->b.b.create_sampler_state = si_create_sampler_state;
-       sctx->b.b.delete_sampler_state = si_delete_sampler_state;
+       sctx->b.create_sampler_state = si_create_sampler_state;
+       sctx->b.delete_sampler_state = si_delete_sampler_state;
 
-       sctx->b.b.create_sampler_view = si_create_sampler_view;
-       sctx->b.b.sampler_view_destroy = si_sampler_view_destroy;
+       sctx->b.create_sampler_view = si_create_sampler_view;
+       sctx->b.sampler_view_destroy = si_sampler_view_destroy;
 
-       sctx->b.b.set_sample_mask = si_set_sample_mask;
+       sctx->b.set_sample_mask = si_set_sample_mask;
 
-       sctx->b.b.create_vertex_elements_state = si_create_vertex_elements;
-       sctx->b.b.bind_vertex_elements_state = si_bind_vertex_elements;
-       sctx->b.b.delete_vertex_elements_state = si_delete_vertex_element;
-       sctx->b.b.set_vertex_buffers = si_set_vertex_buffers;
+       sctx->b.create_vertex_elements_state = si_create_vertex_elements;
+       sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
+       sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
+       sctx->b.set_vertex_buffers = si_set_vertex_buffers;
 
-       sctx->b.b.texture_barrier = si_texture_barrier;
-       sctx->b.b.memory_barrier = si_memory_barrier;
-       sctx->b.b.set_min_samples = si_set_min_samples;
-       sctx->b.b.set_tess_state = si_set_tess_state;
+       sctx->b.texture_barrier = si_texture_barrier;
+       sctx->b.memory_barrier = si_memory_barrier;
+       sctx->b.set_min_samples = si_set_min_samples;
+       sctx->b.set_tess_state = si_set_tess_state;
 
-       sctx->b.b.set_active_query_state = si_set_active_query_state;
-       sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
-       sctx->b.save_qbo_state = si_save_qbo_state;
-       sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
+       sctx->b.set_active_query_state = si_set_active_query_state;
 
-       sctx->b.b.draw_vbo = si_draw_vbo;
+       sctx->b.draw_vbo = si_draw_vbo;
 
        si_init_config(sctx);
 }
 
-static uint32_t si_get_bo_metadata_word1(struct r600_common_screen *rscreen)
-{
-       return (ATI_VENDOR_ID << 16) | rscreen->info.pci_id;
-}
-
-static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
-                                    struct r600_texture *rtex,
-                                    struct radeon_bo_metadata *md)
-{
-       struct si_screen *sscreen = (struct si_screen*)rscreen;
-       struct pipe_resource *res = &rtex->resource.b.b;
-       static const unsigned char swizzle[] = {
-               PIPE_SWIZZLE_X,
-               PIPE_SWIZZLE_Y,
-               PIPE_SWIZZLE_Z,
-               PIPE_SWIZZLE_W
-       };
-       uint32_t desc[8], i;
-       bool is_array = util_resource_is_array_texture(res);
-
-       /* DRM 2.x.x doesn't support this. */
-       if (rscreen->info.drm_major != 3)
-               return;
-
-       assert(rtex->dcc_separate_buffer == NULL);
-       assert(rtex->fmask.size == 0);
-
-       /* Metadata image format format version 1:
-        * [0] = 1 (metadata format identifier)
-        * [1] = (VENDOR_ID << 16) | PCI_ID
-        * [2:9] = image descriptor for the whole resource
-        *         [2] is always 0, because the base address is cleared
-        *         [9] is the DCC offset bits [39:8] from the beginning of
-        *             the buffer
-        * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level
-        */
-
-       md->metadata[0] = 1; /* metadata image format version 1 */
-
-       /* TILE_MODE_INDEX is ambiguous without a PCI ID. */
-       md->metadata[1] = si_get_bo_metadata_word1(rscreen);
-
-       si_make_texture_descriptor(sscreen, rtex, true,
-                                  res->target, res->format,
-                                  swizzle, 0, res->last_level, 0,
-                                  is_array ? res->array_size - 1 : 0,
-                                  res->width0, res->height0, res->depth0,
-                                  desc, NULL);
-
-       si_set_mutable_tex_desc_fields(sscreen, rtex, &rtex->surface.u.legacy.level[0],
-                                      0, 0, rtex->surface.blk_w, false, desc);
-
-       /* Clear the base address and set the relative DCC offset. */
-       desc[0] = 0;
-       desc[1] &= C_008F14_BASE_ADDRESS_HI;
-       desc[7] = rtex->dcc_offset >> 8;
-
-       /* Dwords [2:9] contain the image descriptor. */
-       memcpy(&md->metadata[2], desc, sizeof(desc));
-       md->size_metadata = 10 * 4;
-
-       /* Dwords [10:..] contain the mipmap level offsets. */
-       if (rscreen->chip_class <= VI) {
-               for (i = 0; i <= res->last_level; i++)
-                       md->metadata[10+i] = rtex->surface.u.legacy.level[i].offset >> 8;
-
-               md->size_metadata += (1 + res->last_level) * 4;
-       }
-}
-
-static void si_apply_opaque_metadata(struct r600_common_screen *rscreen,
-                                    struct r600_texture *rtex,
-                                    struct radeon_bo_metadata *md)
-{
-       uint32_t *desc = &md->metadata[2];
-
-       if (rscreen->chip_class < VI)
-               return;
-
-       /* Return if DCC is enabled. The texture should be set up with it
-        * already.
-        */
-       if (md->size_metadata >= 11 * 4 &&
-           md->metadata[0] != 0 &&
-           md->metadata[1] == si_get_bo_metadata_word1(rscreen) &&
-           G_008F28_COMPRESSION_EN(desc[6])) {
-               assert(rtex->dcc_offset == ((uint64_t)desc[7] << 8));
-               return;
-       }
-
-       /* Disable DCC. These are always set by texture_from_handle and must
-        * be cleared here.
-        */
-       rtex->dcc_offset = 0;
-}
-
 void si_init_screen_state_functions(struct si_screen *sscreen)
 {
-       sscreen->b.b.is_format_supported = si_is_format_supported;
-       sscreen->b.query_opaque_metadata = si_query_opaque_metadata;
-       sscreen->b.apply_opaque_metadata = si_apply_opaque_metadata;
+       sscreen->b.is_format_supported = si_is_format_supported;
 }
 
 static void si_set_grbm_gfx_index(struct si_context *sctx,
                                  struct si_pm4_state *pm4,  unsigned value)
 {
-       unsigned reg = sctx->b.chip_class >= CIK ? R_030800_GRBM_GFX_INDEX :
-                                                  GRBM_GFX_INDEX;
+       unsigned reg = sctx->chip_class >= CIK ? R_030800_GRBM_GFX_INDEX :
+                                                  R_00802C_GRBM_GFX_INDEX;
        si_pm4_set_reg(pm4, reg, value);
 }
 
 static void si_set_grbm_gfx_index_se(struct si_context *sctx,
                                     struct si_pm4_state *pm4, unsigned se)
 {
-       assert(se == ~0 || se < sctx->screen->b.info.max_se);
+       assert(se == ~0 || se < sctx->screen->info.max_se);
        si_set_grbm_gfx_index(sctx, pm4,
                              (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) :
                                          S_030800_SE_INDEX(se)) |
@@ -4303,209 +4744,35 @@ si_write_harvested_raster_configs(struct si_context *sctx,
                                  unsigned raster_config,
                                  unsigned raster_config_1)
 {
-       unsigned sh_per_se = MAX2(sctx->screen->b.info.max_sh_per_se, 1);
-       unsigned num_se = MAX2(sctx->screen->b.info.max_se, 1);
-       unsigned rb_mask = sctx->screen->b.info.enabled_rb_mask;
-       unsigned num_rb = MIN2(sctx->screen->b.info.num_render_backends, 16);
-       unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
-       unsigned rb_per_se = num_rb / num_se;
-       unsigned se_mask[4];
+       unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
+       unsigned raster_config_se[4];
        unsigned se;
 
-       se_mask[0] = ((1 << rb_per_se) - 1);
-       se_mask[1] = (se_mask[0] << rb_per_se);
-       se_mask[2] = (se_mask[1] << rb_per_se);
-       se_mask[3] = (se_mask[2] << rb_per_se);
-
-       se_mask[0] &= rb_mask;
-       se_mask[1] &= rb_mask;
-       se_mask[2] &= rb_mask;
-       se_mask[3] &= rb_mask;
-
-       assert(num_se == 1 || num_se == 2 || num_se == 4);
-       assert(sh_per_se == 1 || sh_per_se == 2);
-       assert(rb_per_pkr == 1 || rb_per_pkr == 2);
-
-       /* XXX: I can't figure out what the *_XSEL and *_YSEL
-        * fields are for, so I'm leaving them as their default
-        * values. */
+       ac_get_harvested_configs(&sctx->screen->info,
+                                raster_config,
+                                &raster_config_1,
+                                raster_config_se);
 
        for (se = 0; se < num_se; se++) {
-               unsigned raster_config_se = raster_config;
-               unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
-               unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
-               int idx = (se / 2) * 2;
-
-               if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
-                       raster_config_se &= C_028350_SE_MAP;
-
-                       if (!se_mask[idx]) {
-                               raster_config_se |=
-                                       S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
-                       } else {
-                               raster_config_se |=
-                                       S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
-                       }
-               }
-
-               pkr0_mask &= rb_mask;
-               pkr1_mask &= rb_mask;
-               if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
-                       raster_config_se &= C_028350_PKR_MAP;
-
-                       if (!pkr0_mask) {
-                               raster_config_se |=
-                                       S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
-                       } else {
-                               raster_config_se |=
-                                       S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
-                       }
-               }
-
-               if (rb_per_se >= 2) {
-                       unsigned rb0_mask = 1 << (se * rb_per_se);
-                       unsigned rb1_mask = rb0_mask << 1;
-
-                       rb0_mask &= rb_mask;
-                       rb1_mask &= rb_mask;
-                       if (!rb0_mask || !rb1_mask) {
-                               raster_config_se &= C_028350_RB_MAP_PKR0;
-
-                               if (!rb0_mask) {
-                                       raster_config_se |=
-                                               S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
-                               } else {
-                                       raster_config_se |=
-                                               S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
-                               }
-                       }
-
-                       if (rb_per_se > 2) {
-                               rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
-                               rb1_mask = rb0_mask << 1;
-                               rb0_mask &= rb_mask;
-                               rb1_mask &= rb_mask;
-                               if (!rb0_mask || !rb1_mask) {
-                                       raster_config_se &= C_028350_RB_MAP_PKR1;
-
-                                       if (!rb0_mask) {
-                                               raster_config_se |=
-                                                       S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
-                                       } else {
-                                               raster_config_se |=
-                                                       S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
-                                       }
-                               }
-                       }
-               }
-
                si_set_grbm_gfx_index_se(sctx, pm4, se);
-               si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
+               si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
        }
        si_set_grbm_gfx_index(sctx, pm4, ~0);
 
-       if (sctx->b.chip_class >= CIK) {
-               if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
-                                    (!se_mask[2] && !se_mask[3]))) {
-                       raster_config_1 &= C_028354_SE_PAIR_MAP;
-
-                       if (!se_mask[0] && !se_mask[1]) {
-                               raster_config_1 |=
-                                       S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
-                       } else {
-                               raster_config_1 |=
-                                       S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
-                       }
-               }
-
+       if (sctx->chip_class >= CIK) {
                si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
        }
 }
 
 static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4)
 {
-       struct si_screen *sscreen = sctx->screen;
-       unsigned num_rb = MIN2(sctx->screen->b.info.num_render_backends, 16);
-       unsigned rb_mask = sctx->screen->b.info.enabled_rb_mask;
+       unsigned num_rb = MIN2(sctx->screen->info.num_render_backends, 16);
+       unsigned rb_mask = sctx->screen->info.enabled_rb_mask;
        unsigned raster_config, raster_config_1;
 
-       switch (sctx->b.family) {
-       case CHIP_TAHITI:
-       case CHIP_PITCAIRN:
-               raster_config = 0x2a00126a;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_VERDE:
-               raster_config = 0x0000124a;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_OLAND:
-               raster_config = 0x00000082;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_HAINAN:
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_BONAIRE:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_HAWAII:
-               raster_config = 0x3a00161a;
-               raster_config_1 = 0x0000002e;
-               break;
-       case CHIP_FIJI:
-               if (sscreen->b.info.cik_macrotile_mode_array[0] == 0x000000e8) {
-                       /* old kernels with old tiling config */
-                       raster_config = 0x16000012;
-                       raster_config_1 = 0x0000002a;
-               } else {
-                       raster_config = 0x3a00161a;
-                       raster_config_1 = 0x0000002e;
-               }
-               break;
-       case CHIP_POLARIS10:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x0000002a;
-               break;
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_TONGA:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x0000002a;
-               break;
-       case CHIP_ICELAND:
-               if (num_rb == 1)
-                       raster_config = 0x00000000;
-               else
-                       raster_config = 0x00000002;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_CARRIZO:
-               raster_config = 0x00000002;
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_KAVERI:
-               /* KV should be 0x00000002, but that causes problems with radeon */
-               raster_config = 0x00000000; /* 0x00000002 */
-               raster_config_1 = 0x00000000;
-               break;
-       case CHIP_KABINI:
-       case CHIP_MULLINS:
-       case CHIP_STONEY:
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-               break;
-       default:
-               fprintf(stderr,
-                       "radeonsi: Unknown GPU, using 0 for raster_config\n");
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-       }
+       ac_get_raster_config(&sctx->screen->info,
+                            &raster_config,
+                            &raster_config_1);
 
        if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
                /* Always use the default config when all backends are enabled
@@ -4513,7 +4780,7 @@ static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *p
                 */
                si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
                               raster_config);
-               if (sctx->b.chip_class >= CIK)
+               if (sctx->chip_class >= CIK)
                        si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1,
                                       raster_config_1);
        } else {
@@ -4529,7 +4796,7 @@ static void si_init_config(struct si_context *sctx)
        struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 
        /* Only SI can disable CLEAR_STATE for now. */
-       assert(has_clear_state || sscreen->b.chip_class == SI);
+       assert(has_clear_state || sscreen->info.chip_class == SI);
 
        if (!pm4)
                return;
@@ -4545,7 +4812,7 @@ static void si_init_config(struct si_context *sctx)
                si_pm4_cmd_end(pm4, false);
        }
 
-       if (sctx->b.chip_class <= VI)
+       if (sctx->chip_class <= VI)
                si_set_raster_config(sctx, pm4);
 
        si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
@@ -4553,7 +4820,7 @@ static void si_init_config(struct si_context *sctx)
                si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
        /* FIXME calculate these values somehow ??? */
-       if (sctx->b.chip_class <= VI) {
+       if (sctx->chip_class <= VI) {
                si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
                si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
        }
@@ -4567,20 +4834,17 @@ static void si_init_config(struct si_context *sctx)
        si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
        if (!has_clear_state)
                si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
-       if (sctx->b.chip_class < CIK)
+       if (sctx->chip_class < CIK)
                si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
                               S_008A14_CLIP_VTX_REORDER_ENA(1));
 
-       si_pm4_set_reg(pm4, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210);
-       si_pm4_set_reg(pm4, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98);
-
        if (!has_clear_state)
                si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
 
        /* CLEAR_STATE doesn't clear these correctly on certain generations.
         * I don't know why. Deduced by trial and error.
         */
-       if (sctx->b.chip_class <= CIK) {
+       if (sctx->chip_class <= CIK) {
                si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
                si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
                si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
@@ -4611,7 +4875,7 @@ static void si_init_config(struct si_context *sctx)
                si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
        }
 
-       if (sctx->b.chip_class >= GFX9) {
+       if (sctx->chip_class >= GFX9) {
                si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
                si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
                si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
@@ -4625,13 +4889,17 @@ static void si_init_config(struct si_context *sctx)
                si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
        }
 
-       if (sctx->b.chip_class >= CIK) {
-               if (sctx->b.chip_class >= GFX9) {
-                       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
+       if (sctx->chip_class >= CIK) {
+               if (sctx->chip_class >= GFX9) {
+                       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                                      S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
                } else {
-                       si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
-                       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
-                       si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+                       si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+                                      S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+                       si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+                                      S_00B41C_WAVE_LIMIT(0x3F));
+                       si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+                                      S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
 
                        /* If this is 0, Bonaire can hang even if GS isn't being used.
                         * Other chips are unaffected. These are suboptimal values,
@@ -4641,15 +4909,16 @@ static void si_init_config(struct si_context *sctx)
                                       S_028A44_ES_VERTS_PER_SUBGRP(64) |
                                       S_028A44_GS_PRIMS_PER_SUBGRP(4));
                }
-               si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
+               si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                              S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
 
                /* Compute LATE_ALLOC_VS.LIMIT. */
-               unsigned num_cu_per_sh = sscreen->b.info.num_good_compute_units /
-                                        (sscreen->b.info.max_se *
-                                         sscreen->b.info.max_sh_per_se);
+               unsigned num_cu_per_sh = sscreen->info.num_good_compute_units /
+                                        (sscreen->info.max_se *
+                                         sscreen->info.max_sh_per_se);
                unsigned late_alloc_limit; /* The limit is per SH. */
 
-               if (sctx->b.family == CHIP_KABINI) {
+               if (sctx->family == CHIP_KABINI) {
                        late_alloc_limit = 0; /* Potential hang on Kabini. */
                } else if (num_cu_per_sh <= 4) {
                        /* Too few available compute units per SH. Disallowing
@@ -4673,19 +4942,17 @@ static void si_init_config(struct si_context *sctx)
 
                /* VS can't execute on one CU if the limit is > 2. */
                si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
-                              S_00B118_CU_EN(late_alloc_limit > 2 ? 0xfffe : 0xffff));
+                              S_00B118_CU_EN(late_alloc_limit > 2 ? 0xfffe : 0xffff) |
+                              S_00B118_WAVE_LIMIT(0x3F));
                si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
                               S_00B11C_LIMIT(late_alloc_limit));
-               si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
+               si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+                              S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
        }
 
-       if (sctx->b.chip_class >= VI) {
+       if (sctx->chip_class >= VI) {
                unsigned vgt_tess_distribution;
 
-               si_pm4_set_reg(pm4, R_028424_CB_DCC_CONTROL,
-                              S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
-                              S_028424_OVERWRITE_COMBINER_WATERMARK(4));
-
                vgt_tess_distribution =
                        S_028B50_ACCUM_ISOLINE(32) |
                        S_028B50_ACCUM_TRI(11) |
@@ -4695,8 +4962,8 @@ static void si_init_config(struct si_context *sctx)
                /* Testing with Unigine Heaven extreme tesselation yielded best results
                 * with TRAP_SPLIT = 3.
                 */
-               if (sctx->b.family == CHIP_FIJI ||
-                   sctx->b.family >= CHIP_POLARIS10)
+               if (sctx->family == CHIP_FIJI ||
+                   sctx->family >= CHIP_POLARIS10)
                        vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
 
                si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
@@ -4706,17 +4973,20 @@ static void si_init_config(struct si_context *sctx)
        }
 
        si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
-       if (sctx->b.chip_class >= CIK)
-               si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40);
+       if (sctx->chip_class >= CIK) {
+               si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI,
+                              S_028084_ADDRESS(border_color_va >> 40));
+       }
        si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ,
                      RADEON_PRIO_BORDER_COLORS);
 
-       if (sctx->b.chip_class >= GFX9) {
-               unsigned num_se = sscreen->b.info.max_se;
+       if (sctx->chip_class >= GFX9) {
+               unsigned num_se = sscreen->info.max_se;
                unsigned pc_lines = 0;
 
-               switch (sctx->b.family) {
+               switch (sctx->family) {
                case CHIP_VEGA10:
+               case CHIP_VEGA12:
                        pc_lines = 4096;
                        break;
                case CHIP_RAVEN:
@@ -4726,12 +4996,6 @@ static void si_init_config(struct si_context *sctx)
                        assert(0);
                }
 
-               si_pm4_set_reg(pm4, R_028060_DB_DFSM_CONTROL,
-                              S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
-               /* TODO: Enable the binner: */
-               si_pm4_set_reg(pm4, R_028C44_PA_SC_BINNER_CNTL_0,
-                              S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
-                              S_028C44_DISABLE_START_OF_PRIM(1));
                si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
                               S_028C48_MAX_ALLOC_COUNT(MIN2(128, pc_lines / (4 * num_se))) |
                               S_028C48_MAX_PRIM_PER_BATCH(1023));