radeonsi: overhaul the vertex fetch fixup mechanism
[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
index e3b45fa6ea7dbaa98c4d0eca33057ef71be01ec1..55965bc86a1bfa5b23fdc2408f59b8c62e6cfca0 100644 (file)
@@ -256,7 +256,7 @@ static void si_emit_cb_render_state(struct si_context *sctx)
                                            sx_blend_opt_control);
        }
        if (initial_cdw != cs->current.cdw)
-               sctx->context_roll_counter++;
+               sctx->context_roll = true;
 }
 
 /*
@@ -474,10 +474,11 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 
        si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
                       S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET2(2) |
-                      S_028B70_ALPHA_TO_MASK_OFFSET3(2));
+                      S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
+                      S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
+                      S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
+                      S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
+                      S_028B70_OFFSET_ROUND(1));
 
        if (state->alpha_to_coverage)
                blend->need_src_alpha_4bit |= 0xf;
@@ -792,7 +793,7 @@ static void si_emit_clip_regs(struct si_context *sctx)
                S_028810_CLIP_DISABLE(window_space));
 
        if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll_counter++;
+               sctx->context_roll = true;
 }
 
 /*
@@ -1014,10 +1015,8 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
        si_update_poly_offset_state(sctx);
 
        if (!old_rs ||
-           old_rs->scissor_enable != rs->scissor_enable) {
-               sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+           old_rs->scissor_enable != rs->scissor_enable)
                si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
-       }
 
        if (!old_rs ||
            old_rs->line_width != rs->line_width ||
@@ -1026,10 +1025,8 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
                si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
 
        if (!old_rs ||
-           old_rs->clip_halfz != rs->clip_halfz) {
-               sctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+           old_rs->clip_halfz != rs->clip_halfz)
                si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
-       }
 
        if (!old_rs ||
            old_rs->clip_plane_enable != rs->clip_plane_enable ||
@@ -1355,6 +1352,14 @@ void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 
        si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
        si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+
+       st->saved_ssbo_writable_mask = 0;
+
+       for (unsigned i = 0; i < 3; i++) {
+               if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+                   (1u << si_get_shaderbuf_slot(i)))
+                       st->saved_ssbo_writable_mask |= 1 << i;
+       }
 }
 
 static void si_emit_db_render_state(struct si_context *sctx)
@@ -1446,7 +1451,7 @@ static void si_emit_db_render_state(struct si_context *sctx)
                                   SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
 
        if (initial_cdw != sctx->gfx_cs->current.cdw)
-               sctx->context_roll_counter++;
+               sctx->context_roll = true;
 }
 
 /*
@@ -2151,7 +2156,7 @@ static boolean si_is_format_supported(struct pipe_screen *screen,
        unsigned retval = 0;
 
        if (target >= PIPE_MAX_TEXTURE_TYPES) {
-               PRINT_ERR("r600: unsupported texture type %d\n", target);
+               PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
                return false;
        }
 
@@ -2806,9 +2811,11 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         *
         * Only flush and wait for CB if there is actually a bound color buffer.
         */
-       if (sctx->framebuffer.uncompressed_cb_mask)
+       if (sctx->framebuffer.uncompressed_cb_mask) {
                si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
-                                          sctx->framebuffer.CB_has_shader_readable_metadata);
+                                          sctx->framebuffer.CB_has_shader_readable_metadata,
+                                          sctx->framebuffer.all_DCC_pipe_aligned);
+       }
 
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
@@ -2857,6 +2864,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        sctx->framebuffer.any_dst_linear = false;
        sctx->framebuffer.CB_has_shader_readable_metadata = false;
        sctx->framebuffer.DB_has_shader_readable_metadata = false;
+       sctx->framebuffer.all_DCC_pipe_aligned = true;
        unsigned num_bpp64_colorbufs = 0;
 
        for (i = 0; i < state->nr_cbufs; i++) {
@@ -2907,9 +2915,14 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
                if (tex->surface.bpe >= 8)
                        num_bpp64_colorbufs++;
 
-               if (vi_dcc_enabled(tex, surf->base.u.tex.level))
+               if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
                        sctx->framebuffer.CB_has_shader_readable_metadata = true;
 
+                       if (sctx->chip_class >= GFX9 &&
+                           !tex->surface.u.gfx9.dcc.pipe_aligned)
+                               sctx->framebuffer.all_DCC_pipe_aligned = false;
+               }
+
                si_context_add_resource_size(sctx, surf->base.texture);
 
                p_atomic_inc(&tex->framebuffers_bound);
@@ -3527,7 +3540,7 @@ static void si_emit_msaa_config(struct si_context *sctx)
                                   SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
 
        if (initial_cdw != cs->current.cdw) {
-               sctx->context_roll_counter++;
+               sctx->context_roll = true;
 
                /* GFX9: Flush DFSM when the AA mode changes. */
                if (sctx->screen->dfsm_allowed) {
@@ -3570,7 +3583,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
 void
-si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
+si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
                          enum pipe_format format,
                          unsigned offset, unsigned size,
                          uint32_t *state)
@@ -3613,14 +3626,11 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
         * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
         * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
         */
-       if (screen->info.chip_class >= GFX9)
-               /* When vindex == 0, LLVM sets IDXEN = 0, thus changing units
+       if (screen->info.chip_class >= GFX9 && HAVE_LLVM < 0x0800)
+               /* When vindex == 0, LLVM < 8.0 sets IDXEN = 0, thus changing units
                 * from STRIDE to bytes. This works around it by setting
                 * NUM_RECORDS to at least the size of one element, so that
                 * the first element is readable when IDXEN == 0.
-                *
-                * TODO: Fix this in LLVM, but do we need a new intrinsic where
-                *       IDXEN is enforced?
                 */
                num_records = num_records ? MAX2(num_records, stride) : 0;
        else if (screen->info.chip_class == VI)
@@ -4064,7 +4074,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
        /* Buffer resource. */
        if (texture->target == PIPE_BUFFER) {
                si_make_buffer_descriptor(sctx->screen,
-                                         r600_resource(texture),
+                                         si_resource(texture),
                                          state->format,
                                          state->u.buf.offset,
                                          state->u.buf.size,
@@ -4449,10 +4459,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
        for (i = 0; i < count; ++i) {
                const struct util_format_description *desc;
                const struct util_format_channel_description *channel;
-               unsigned data_format, num_format;
                int first_non_void;
                unsigned vbo_index = elements[i].vertex_buffer_index;
-               unsigned char swizzle[4];
 
                if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
                        FREE(v);
@@ -4479,112 +4487,144 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 
                desc = util_format_description(elements[i].src_format);
                first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
-               data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
-               num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
                channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
-               memcpy(swizzle, desc->swizzle, sizeof(swizzle));
 
                v->format_size[i] = desc->block.bits / 8;
                v->src_offset[i] = elements[i].src_offset;
                v->vertex_buffer_index[i] = vbo_index;
 
-               /* The hardware always treats the 2-bit alpha channel as
-                * unsigned, so a shader workaround is needed. The affected
-                * chips are VI and older except Stoney (GFX8.1).
-                */
-               if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10 &&
-                   sscreen->info.chip_class <= VI &&
-                   sscreen->info.family != CHIP_STONEY) {
-                       if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
-                               v->fix_fetch[i] = SI_FIX_FETCH_A2_SNORM;
-                       } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) {
-                               v->fix_fetch[i] = SI_FIX_FETCH_A2_SSCALED;
-                       } else if (num_format == V_008F0C_BUF_NUM_FORMAT_SINT) {
-                               /* This isn't actually used in OpenGL. */
-                               v->fix_fetch[i] = SI_FIX_FETCH_A2_SINT;
-                       }
-               } else if (channel && channel->type == UTIL_FORMAT_TYPE_FIXED) {
-                       if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_FIXED;
-                       else
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_FIXED;
-               } else if (channel && channel->size == 32 && !channel->pure_integer) {
-                       if (channel->type == UTIL_FORMAT_TYPE_SIGNED) {
-                               if (channel->normalized) {
-                                       if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                                               v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_SNORM;
-                                       else
-                                               v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SNORM;
-                               } else {
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_SSCALED;
-                               }
-                       } else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) {
-                               if (channel->normalized) {
-                                       if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                                               v->fix_fetch[i] = SI_FIX_FETCH_RGBX_32_UNORM;
-                                       else
-                                               v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_UNORM;
-                               } else {
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGBA_32_USCALED;
-                               }
-                       }
-               } else if (channel && channel->size == 64 &&
-                          channel->type == UTIL_FORMAT_TYPE_FLOAT) {
-                       switch (desc->nr_channels) {
-                       case 1:
-                       case 2:
-                               v->fix_fetch[i] = SI_FIX_FETCH_RG_64_FLOAT;
-                               swizzle[0] = PIPE_SWIZZLE_X;
-                               swizzle[1] = PIPE_SWIZZLE_Y;
-                               swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0;
-                               swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0;
-                               break;
-                       case 3:
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGB_64_FLOAT;
-                               swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */
-                               swizzle[1] = PIPE_SWIZZLE_Y;
-                               swizzle[2] = PIPE_SWIZZLE_0;
-                               swizzle[3] = PIPE_SWIZZLE_0;
-                               break;
-                       case 4:
-                               v->fix_fetch[i] = SI_FIX_FETCH_RGBA_64_FLOAT;
-                               swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */
-                               swizzle[1] = PIPE_SWIZZLE_Y;
-                               swizzle[2] = PIPE_SWIZZLE_Z;
-                               swizzle[3] = PIPE_SWIZZLE_W;
-                               break;
-                       default:
-                               assert(0);
-                       }
-               } else if (channel && desc->nr_channels == 3) {
-                       assert(desc->swizzle[0] == PIPE_SWIZZLE_X);
+               bool always_fix = false;
+               union si_vs_fix_fetch fix_fetch;
+               unsigned log_hw_load_size; /* the load element size as seen by the hardware */
+
+               fix_fetch.bits = 0;
+               log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
 
-                       if (channel->size == 8) {
+               if (channel) {
+                       switch (channel->type) {
+                       case UTIL_FORMAT_TYPE_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
+                       case UTIL_FORMAT_TYPE_FIXED: fix_fetch.u.format = AC_FETCH_FORMAT_FIXED; break;
+                       case UTIL_FORMAT_TYPE_SIGNED: {
                                if (channel->pure_integer)
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGB_8_INT;
+                                       fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
+                               else if (channel->normalized)
+                                       fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
                                else
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGB_8;
-                       } else if (channel->size == 16) {
+                                       fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
+                               break;
+                       }
+                       case UTIL_FORMAT_TYPE_UNSIGNED: {
                                if (channel->pure_integer)
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGB_16_INT;
+                                       fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
+                               else if (channel->normalized)
+                                       fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
                                else
-                                       v->fix_fetch[i] = SI_FIX_FETCH_RGB_16;
+                                       fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
+                               break;
+                       }
+                       default: unreachable("bad format type");
+                       }
+               } else {
+                       switch (elements[i].src_format) {
+                       case PIPE_FORMAT_R11G11B10_FLOAT: fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT; break;
+                       default: unreachable("bad other format");
+                       }
+               }
+
+               if (desc->channel[0].size == 10) {
+                       fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
+                       log_hw_load_size = 2;
+
+                       /* The hardware always treats the 2-bit alpha channel as
+                        * unsigned, so a shader workaround is needed. The affected
+                        * chips are VI and older except Stoney (GFX8.1).
+                        */
+                       always_fix = sscreen->info.chip_class <= VI &&
+                                    sscreen->info.family != CHIP_STONEY &&
+                                    channel->type == UTIL_FORMAT_TYPE_SIGNED;
+               } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
+                       fix_fetch.u.log_size = 3; /* special encoding */
+                       fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
+                       log_hw_load_size = 2;
+               } else {
+                       fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
+                       fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
+
+                       /* Always fix up:
+                        * - doubles (multiple loads + truncate to float)
+                        * - 32-bit requiring a conversion
+                        */
+                       always_fix =
+                               (fix_fetch.u.log_size == 3) ||
+                               (fix_fetch.u.log_size == 2 &&
+                                fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
+                                fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
+                                fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
+
+                       /* Also fixup 8_8_8 and 16_16_16. */
+                       if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
+                               always_fix = true;
+                               log_hw_load_size = fix_fetch.u.log_size;
                        }
                }
 
-               v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
-                                  S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
-                                  S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
-                                  S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
-                                  S_008F0C_NUM_FORMAT(num_format) |
-                                  S_008F0C_DATA_FORMAT(data_format);
+               if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
+                       assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
+                              (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
+                       fix_fetch.u.reverse = 1;
+               }
+
+               /* Force the workaround for unaligned access here already if the
+                * offset relative to the vertex buffer base is unaligned.
+                *
+                * There is a theoretical case in which this is too conservative:
+                * if the vertex buffer's offset is also unaligned in just the
+                * right way, we end up with an aligned address after all.
+                * However, this case should be extremely rare in practice (it
+                * won't happen in well-behaved applications), and taking it
+                * into account would complicate the fast path (where everything
+                * is nicely aligned).
+                */
+               bool check_alignment = log_hw_load_size >= 1 && sscreen->info.chip_class == SI;
+               bool opencode = sscreen->options.vs_fetch_always_opencode;
+
+               if (check_alignment &&
+                   (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
+                       opencode = true;
+
+               if (always_fix || check_alignment || opencode)
+                       v->fix_fetch[i] = fix_fetch.bits;
+
+               if (opencode)
+                       v->fix_fetch_opencode |= 1 << i;
+               if (opencode || always_fix)
+                       v->fix_fetch_always |= 1 << i;
+
+               if (check_alignment && !opencode) {
+                       assert(log_hw_load_size == 1 || log_hw_load_size == 2);
+
+                       v->fix_fetch_unaligned |= 1 << i;
+                       v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
+                       v->vb_alignment_check_mask |= 1 << vbo_index;
+               }
+
+               v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+                                  S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+                                  S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+                                  S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
+
+               unsigned data_format, num_format;
+               data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
+               num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
+               v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) |
+                                   S_008F0C_DATA_FORMAT(data_format);
        }
 
        if (v->instance_divisor_is_fetched) {
                unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
 
                v->instance_divisor_factor_buffer =
-                       (struct r600_resource*)
+                       (struct si_resource*)
                        pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
                                           num_divisors * sizeof(divisor_factors[0]));
                if (!v->instance_divisor_factor_buffer) {
@@ -4611,7 +4651,17 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
            (!old ||
             old->count != v->count ||
             old->uses_instance_divisors != v->uses_instance_divisors ||
-            v->uses_instance_divisors || /* we don't check which divisors changed */
+            /* we don't check which divisors changed */
+            v->uses_instance_divisors ||
+            (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned ||
+            ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
+             memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
+                    sizeof(v->vertex_buffer_index[0]) * v->count)) ||
+            /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
+             * functions of fix_fetch and the src_offset alignment.
+             * If they change and fix_fetch doesn't, it must be due to different
+             * src_offset alignment, which is reflected in fix_fetch_opencode. */
+            old->fix_fetch_opencode != v->fix_fetch_opencode ||
             memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
                sctx->do_update_shaders = true;
 
@@ -4633,7 +4683,7 @@ static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 
        if (sctx->vertex_elements == state)
                sctx->vertex_elements = NULL;
-       r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
+       si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
        FREE(state);
 }
 
@@ -4643,6 +4693,8 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
+       uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
+       uint32_t unaligned = orig_unaligned;
        int i;
 
        assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
@@ -4656,16 +4708,35 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
                        pipe_resource_reference(&dsti->buffer.resource, buf);
                        dsti->buffer_offset = src->buffer_offset;
                        dsti->stride = src->stride;
+                       if (dsti->buffer_offset & 3 || dsti->stride & 3)
+                               unaligned |= 1 << (start_slot + i);
+                       else
+                               unaligned &= ~(1 << (start_slot + i));
+
                        si_context_add_resource_size(sctx, buf);
                        if (buf)
-                               r600_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+                               si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
                }
        } else {
                for (i = 0; i < count; i++) {
                        pipe_resource_reference(&dst[i].buffer.resource, NULL);
                }
+               unaligned &= ~u_bit_consecutive(start_slot, count);
        }
        sctx->vertex_buffers_dirty = true;
+       sctx->vertex_buffer_unaligned = unaligned;
+
+       /* Check whether alignment may have changed in a way that requires
+        * shader changes. This check is conservative: a vertex buffer can only
+        * trigger a shader change if the misalignment amount changes (e.g.
+        * from byte-aligned to short-aligned), but we only keep track of
+        * whether buffers are at least dword-aligned, since that should always
+        * be the case in well-behaved applications anyway.
+        */
+       if (sctx->vertex_elements &&
+           (sctx->vertex_elements->vb_alignment_check_mask &
+            (unaligned | orig_unaligned) & u_bit_consecutive(start_slot, count)))
+               sctx->do_update_shaders = true;
 }
 
 /*
@@ -4687,7 +4758,7 @@ static void si_set_tess_state(struct pipe_context *ctx,
        cb.user_buffer = NULL;
        cb.buffer_size = sizeof(array);
 
-       si_upload_const_buffer(sctx, (struct r600_resource**)&cb.buffer,
+       si_upload_const_buffer(sctx, (struct si_resource**)&cb.buffer,
                               (void*)array, sizeof(array),
                               &cb.buffer_offset);
 
@@ -4702,9 +4773,11 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
        si_update_fb_dirtiness_after_rendering(sctx);
 
        /* Multisample surfaces are flushed in si_decompress_textures. */
-       if (sctx->framebuffer.uncompressed_cb_mask)
+       if (sctx->framebuffer.uncompressed_cb_mask) {
                si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
-                                          sctx->framebuffer.CB_has_shader_readable_metadata);
+                                          sctx->framebuffer.CB_has_shader_readable_metadata,
+                                          sctx->framebuffer.all_DCC_pipe_aligned);
+       }
 }
 
 /* This only ensures coherency for shader image/buffer stores. */
@@ -4712,6 +4785,9 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
+       if (!(flags & ~PIPE_BARRIER_UPDATE))
+               return;
+
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
        sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
@@ -4770,6 +4846,15 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 
 static void si_init_config(struct si_context *sctx);
 
+void si_init_state_compute_functions(struct si_context *sctx)
+{ /* Installs the sampler-state, sampler-view and memory_barrier hooks on sctx->b. */
+       sctx->b.create_sampler_state = si_create_sampler_state;
+       sctx->b.delete_sampler_state = si_delete_sampler_state;
+       sctx->b.create_sampler_view = si_create_sampler_view;
+       sctx->b.sampler_view_destroy = si_sampler_view_destroy;
+       sctx->b.memory_barrier = si_memory_barrier;
+} /* NOTE(review): presumably the hook subset a compute-only context needs -- confirm with callers. */
+
 void si_init_state_functions(struct si_context *sctx)
 {
        sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
@@ -4808,12 +4893,6 @@ void si_init_state_functions(struct si_context *sctx)
 
        sctx->b.set_framebuffer_state = si_set_framebuffer_state;
 
-       sctx->b.create_sampler_state = si_create_sampler_state;
-       sctx->b.delete_sampler_state = si_delete_sampler_state;
-
-       sctx->b.create_sampler_view = si_create_sampler_view;
-       sctx->b.sampler_view_destroy = si_sampler_view_destroy;
-
        sctx->b.set_sample_mask = si_set_sample_mask;
 
        sctx->b.create_vertex_elements_state = si_create_vertex_elements;
@@ -4822,14 +4901,11 @@ void si_init_state_functions(struct si_context *sctx)
        sctx->b.set_vertex_buffers = si_set_vertex_buffers;
 
        sctx->b.texture_barrier = si_texture_barrier;
-       sctx->b.memory_barrier = si_memory_barrier;
        sctx->b.set_min_samples = si_set_min_samples;
        sctx->b.set_tess_state = si_set_tess_state;
 
        sctx->b.set_active_query_state = si_set_active_query_state;
 
-       sctx->b.draw_vbo = si_draw_vbo;
-
        si_init_config(sctx);
 }