radeonsi: don't flush and wait for CB after depth-only rendering
[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
index 27a88a8cef5e219dc4a3047166b44df224d3d191..b236bed306542bafe612a0b6293693e57f098dd5 100644 (file)
@@ -870,6 +870,15 @@ static void *si_create_rs_state(struct pipe_context *ctx,
                S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
                S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));
 
+       if (!rs->uses_poly_offset)
+               return rs;
+
+       rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
+       if (!rs->pm4_poly_offset) {
+               FREE(rs);
+               return NULL;
+       }
+
        /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
        for (i = 0; i < 3; i++) {
                struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
@@ -965,10 +974,13 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 static void si_delete_rs_state(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
+       struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;
 
        if (sctx->queued.named.rasterizer == state)
                si_pm4_bind_state(sctx, poly_offset, NULL);
-       si_pm4_delete_state(sctx, rasterizer, (struct si_state_rasterizer *)state);
+
+       FREE(rs->pm4_poly_offset);
+       si_pm4_delete_state(sctx, rasterizer, rs);
 }
 
 /*
@@ -2513,14 +2525,38 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         * the only client not using TC that can change textures is
         * the framebuffer.
         *
-        * Flush all CB and DB caches here because all buffers can be used
-        * for write by both TC (with shader image stores) and CB/DB.
+        * Wait for compute shaders because of possible transitions:
+        * - FB write -> shader read
+        * - shader write -> FB read
+        *
+        * DB caches are flushed on demand (using si_decompress_textures).
+        *
+        * When MSAA is enabled, CB and TC caches are flushed on demand
+        * (after FMASK decompression). Shader write -> FB read transitions
+        * cannot happen for MSAA textures, because MSAA shader images are
+        * not supported.
+        *
+        * Only flush and wait for CB if there is actually a bound color buffer.
+        */
+       if (sctx->framebuffer.nr_samples <= 1 &&
+           sctx->framebuffer.state.nr_cbufs) {
+               sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+                                SI_CONTEXT_INV_GLOBAL_L2 |
+                                SI_CONTEXT_FLUSH_AND_INV_CB;
+       }
+       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+       /* u_blitter doesn't invoke depth decompression when it does multiple
+        * blits in a row, but the only case when it matters for DB is when
+        * doing generate_mipmap. So here we flush DB manually between
+        * individual generate_mipmap blits.
+        * Note that lower mipmap levels aren't compressed.
         */
-       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_CB |
-                        SI_CONTEXT_FLUSH_AND_INV_DB |
-                        SI_CONTEXT_CS_PARTIAL_FLUSH;
+       if (sctx->generate_mipmap_for_depth) {
+               sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+                                SI_CONTEXT_INV_GLOBAL_L2 |
+                                SI_CONTEXT_FLUSH_AND_INV_DB;
+       }
 
        /* Take the maximum of the old and new count. If the new count is lower,
         * dirtying is needed to disable the unbound colorbuffers.
@@ -3215,6 +3251,12 @@ si_make_texture_descriptor(struct si_screen *screen,
                data_format = V_008F14_IMG_DATA_FORMAT_24_8;
        }
 
+       /* S8 with Z32 HTILE needs a special format. */
+       if (screen->b.chip_class >= GFX9 &&
+           pipe_format == PIPE_FORMAT_S8_UINT &&
+           tex->tc_compatible_htile)
+               data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
+
        if (!sampler &&
            (res->target == PIPE_TEXTURE_CUBE ||
             res->target == PIPE_TEXTURE_CUBE_ARRAY ||
@@ -3704,7 +3746,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                                       const struct pipe_vertex_element *elements)
 {
        struct si_screen *sscreen = (struct si_screen*)ctx->screen;
-       struct si_vertex_element *v = CALLOC_STRUCT(si_vertex_element);
+       struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
        bool used[SI_NUM_VERTEX_BUFFERS] = {};
        int i;
 
@@ -3728,8 +3770,10 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                        return NULL;
                }
 
-               if (elements[i].instance_divisor)
+               if (elements[i].instance_divisor) {
                        v->uses_instance_divisors = true;
+                       v->instance_divisors[i] = elements[i].instance_divisor;
+               }
 
                if (!used[vbo_index]) {
                        v->first_vb_use_mask |= 1 << i;
@@ -3744,6 +3788,8 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                memcpy(swizzle, desc->swizzle, sizeof(swizzle));
 
                v->format_size[i] = desc->block.bits / 8;
+               v->src_offset[i] = elements[i].src_offset;
+               v->vertex_buffer_index[i] = vbo_index;
 
                /* The hardware always treats the 2-bit alpha channel as
                 * unsigned, so a shader workaround is needed. The affected
@@ -3836,16 +3882,14 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
                                   S_008F0C_NUM_FORMAT(num_format) |
                                   S_008F0C_DATA_FORMAT(data_format);
        }
-       memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);
-
        return v;
 }
 
 static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
-       struct si_vertex_element *old = sctx->vertex_elements;
-       struct si_vertex_element *v = (struct si_vertex_element*)state;
+       struct si_vertex_elements *old = sctx->vertex_elements;
+       struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
        sctx->vertex_elements = v;
        sctx->vertex_buffers_dirty = true;
@@ -3930,9 +3974,12 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
-       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_CB;
+       /* Multisample surfaces are flushed in si_decompress_textures. */
+       if (sctx->framebuffer.nr_samples <= 1) {
+               sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+                                SI_CONTEXT_INV_GLOBAL_L2 |
+                                SI_CONTEXT_FLUSH_AND_INV_CB;
+       }
        sctx->framebuffer.do_update_surf_dirtiness = true;
 }
 
@@ -3970,12 +4017,18 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
                        sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
        }
 
-       if (flags & PIPE_BARRIER_FRAMEBUFFER)
+       /* MSAA color, any depth and any stencil are flushed in
+        * si_decompress_textures when needed.
+        */
+       if (flags & PIPE_BARRIER_FRAMEBUFFER &&
+           sctx->framebuffer.nr_samples <= 1) {
                sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-                                SI_CONTEXT_FLUSH_AND_INV_DB;
+                                SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+       }
 
-       if (flags & (PIPE_BARRIER_FRAMEBUFFER |
-                    PIPE_BARRIER_INDIRECT_BUFFER))
+       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
+       if (sctx->screen->b.chip_class <= VI &&
+           flags & PIPE_BARRIER_INDIRECT_BUFFER)
                sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 }