r600/eg: rework atomic counter emission with flushes
authorDave Airlie <airlied@redhat.com>
Tue, 7 Aug 2018 00:41:20 +0000 (01:41 +0100)
committerDave Airlie <airlied@redhat.com>
Tue, 21 Aug 2018 19:45:38 +0000 (20:45 +0100)
With the current code, we didn't do the space checks prior
to atomic counter setup emission; we also didn't account for
atomic counters in the space check, so we could end up
triggering a flush later as well.

These flushes would be bad, and lead to problems with
parallel tests. We have to ensure the atomic counter copy in,
draw emits and counter copy out are kept in the same command
submission unit.

This reworks the code to drop some useless masks, make the
counting separate from the emits, and make the space checker
handle atomic counter space.

[airlied: want this in 18.2]

Fixes: 06993e4ee (r600: add support for hw atomic counters. (v3))
src/gallium/drivers/r600/evergreen_compute.c
src/gallium/drivers/r600/evergreen_hw_context.c
src/gallium/drivers/r600/evergreen_state.c
src/gallium/drivers/r600/r600_hw_context.c
src/gallium/drivers/r600/r600_pipe.h
src/gallium/drivers/r600/r600_state_common.c

index 90eae1e2829618457bfcf5edfc232466fcf352db..a77f58242e35d506524e494e4c06ec8f2fab909d 100644 (file)
@@ -715,7 +715,6 @@ static void compute_emit_cs(struct r600_context *rctx,
                rctx->cmd_buf_is_compute = true;
        }
 
-       r600_need_cs_space(rctx, 0, true);
        if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI) {
                r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty);
                current = rctx->cs_shader_state.shader->sel->current;
@@ -742,16 +741,22 @@ static void compute_emit_cs(struct r600_context *rctx,
                }
                rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
                rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
+
+               evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
+               r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
+
                if (need_buf_const) {
                        eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
                }
                r600_update_driver_const_buffers(rctx, true);
 
-               if (evergreen_emit_atomic_buffer_setup(rctx, current, combined_atomics, &atomic_used_mask)) {
+               evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
+               if (atomic_used_mask) {
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
                }
-       }
+       } else
+               r600_need_cs_space(rctx, 0, true, 0);
 
        /* Initialize all the compute-related registers.
         *
index d3f3e227c1f6406c5641cdf163383cba7183adc5..5e0e27b0f16f9dfde4b28c259382265219dab413 100644 (file)
@@ -109,7 +109,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 
                r600_need_cs_space(rctx,
                                   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
-                                  R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
+                                  R600_MAX_PFP_SYNC_ME_DWORDS, FALSE, 0);
 
                /* Flush the caches for the first copy only. */
                if (rctx->b.flags) {
index 57e81e30c271939bf5e42ef98d3adf7017754e82..cc41e1143693ee3f489b3fff13363eac15727f08 100644 (file)
@@ -4030,7 +4030,6 @@ static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
 
                if (!buffers || !buffers[idx].buffer) {
                        pipe_resource_reference(&abuf->buffer, NULL);
-                       astate->enabled_mask &= ~(1 << i);
                        continue;
                }
                buf = &buffers[idx];
@@ -4038,7 +4037,6 @@ static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
                pipe_resource_reference(&abuf->buffer, buf->buffer);
                abuf->buffer_offset = buf->buffer_offset;
                abuf->buffer_size = buf->buffer_size;
-               astate->enabled_mask |= (1 << i);
        }
 }
 
@@ -4868,20 +4866,15 @@ static void cayman_write_count_to_gds(struct r600_context *rctx,
        radeon_emit(cs, reloc);
 }
 
-bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
-                                       struct r600_pipe_shader *cs_shader,
-                                       struct r600_shader_atomic *combined_atomics,
-                                       uint8_t *atomic_used_mask_p)
+void evergreen_emit_atomic_buffer_setup_count(struct r600_context *rctx,
+                                             struct r600_pipe_shader *cs_shader,
+                                             struct r600_shader_atomic *combined_atomics,
+                                             uint8_t *atomic_used_mask_p)
 {
-       struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
-       unsigned pkt_flags = 0;
        uint8_t atomic_used_mask = 0;
        int i, j, k;
        bool is_compute = cs_shader ? true : false;
 
-       if (is_compute)
-               pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
-
        for (i = 0; i < (is_compute ? 1 : EG_NUM_HW_STAGES); i++) {
                uint8_t num_atomic_stage;
                struct r600_pipe_shader *pshader;
@@ -4914,8 +4907,25 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
                        }
                }
        }
+       *atomic_used_mask_p = atomic_used_mask;
+}
+
+void evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
+                                       bool is_compute,
+                                       struct r600_shader_atomic *combined_atomics,
+                                       uint8_t atomic_used_mask)
+{
+       struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
+       unsigned pkt_flags = 0;
+       uint32_t mask;
+
+       if (is_compute)
+               pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
+
+       mask = atomic_used_mask;
+       if (!mask)
+               return;
 
-       uint32_t mask = atomic_used_mask;
        while (mask) {
                unsigned atomic_index = u_bit_scan(&mask);
                struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
@@ -4927,8 +4937,6 @@ bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
                else
                        evergreen_emit_set_append_cnt(rctx, atomic, resource, pkt_flags);
        }
-       *atomic_used_mask_p = atomic_used_mask;
-       return true;
 }
 
 void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
@@ -4940,7 +4948,7 @@ void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
        struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
        uint32_t pkt_flags = 0;
        uint32_t event = EVENT_TYPE_PS_DONE;
-       uint32_t mask = astate->enabled_mask;
+       uint32_t mask;
        uint64_t dst_offset;
        unsigned reloc;
 
index 1cfc180ad6ca9629f8c50a2f8b098240d878e4d9..a2f5f637b20cf4f4153cc91bd42638aabf97ea82 100644 (file)
@@ -31,7 +31,7 @@
 
 
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
-                       boolean count_draw_in)
+                       boolean count_draw_in, unsigned num_atomics)
 {
        /* Flush the DMA IB if it's not empty. */
        if (radeon_emitted(ctx->b.dma.cs, 0))
@@ -61,6 +61,9 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
                num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
        }
 
+       /* add atomic counters, 8 pre + 8 post per counter + 16 post if any counters */
+       num_dw += (num_atomics * 16) + (num_atomics ? 16 : 0);
+
        /* Count in r600_suspend_queries. */
        num_dw += ctx->b.num_cs_dw_queries_suspend;
 
@@ -526,7 +529,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 
                r600_need_cs_space(rctx,
                                   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
-                                  3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
+                                  3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE, 0);
 
                /* Flush the caches for the first copy only. */
                if (rctx->b.flags) {
index 6204e3c557b8ea34b3ef2f25f8dcc3f8aacd32dd..239005cab7ff205e420c63f7eedc92e659054941 100644 (file)
@@ -446,8 +446,6 @@ struct r600_shader_state {
 };
 
 struct r600_atomic_buffer_state {
-       uint32_t enabled_mask;
-       uint32_t dirty_mask;
        struct pipe_shader_buffer buffer[EG_MAX_ATOMIC_BUFFERS];
 };
 
@@ -773,7 +771,7 @@ void r600_context_gfx_flush(void *context, unsigned flags,
                            struct pipe_fence_handle **fence);
 void r600_begin_new_cs(struct r600_context *ctx);
 void r600_flush_emit(struct r600_context *ctx);
-void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in, unsigned num_atomics);
 void r600_emit_pfp_sync_me(struct r600_context *rctx);
 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
                             struct pipe_resource *dst, uint64_t dst_offset,
@@ -1067,10 +1065,14 @@ void r600_delete_shader_selector(struct pipe_context *ctx,
                                 struct r600_pipe_shader_selector *sel);
 
 struct r600_shader_atomic;
-bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
-                                       struct r600_pipe_shader *cs_shader,
+void evergreen_emit_atomic_buffer_setup_count(struct r600_context *rctx,
+                                             struct r600_pipe_shader *cs_shader,
+                                             struct r600_shader_atomic *combined_atomics,
+                                             uint8_t *atomic_used_mask_p);
+void evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
+                                       bool is_compute,
                                        struct r600_shader_atomic *combined_atomics,
-                                       uint8_t *atomic_used_mask_p);
+                                       uint8_t atomic_used_mask);
 void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
                                       bool is_compute,
                                       struct r600_shader_atomic *combined_atomics,
index 402d95838f027a2380fb8555fe857051d9f40793..e6c1b0be97ccb2f5960b1fa90c7874be816e2444 100644 (file)
@@ -2085,8 +2085,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                : (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]
                : info->mode;
 
-       if (rctx->b.chip_class >= EVERGREEN)
-               evergreen_emit_atomic_buffer_setup(rctx, NULL, combined_atomics, &atomic_used_mask);
+       if (rctx->b.chip_class >= EVERGREEN) {
+               evergreen_emit_atomic_buffer_setup_count(rctx, NULL, combined_atomics, &atomic_used_mask);
+       }
 
        if (index_size) {
                index_offset += info->start * index_size;
@@ -2172,7 +2173,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                evergreen_setup_tess_constants(rctx, info, &num_patches);
 
        /* Emit states. */
-       r600_need_cs_space(rctx, has_user_indices ? 5 : 0, TRUE);
+       r600_need_cs_space(rctx, has_user_indices ? 5 : 0, TRUE, util_bitcount(atomic_used_mask));
        r600_flush_emit(rctx);
 
        mask = rctx->dirty_atoms;
@@ -2180,6 +2181,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                r600_emit_atom(rctx, rctx->atoms[u_bit_scan64(&mask)]);
        }
 
+       if (rctx->b.chip_class >= EVERGREEN) {
+               evergreen_emit_atomic_buffer_setup(rctx, false, combined_atomics, atomic_used_mask);
+       }
+               
        if (rctx->b.chip_class == CAYMAN) {
                /* Copied from radeonsi. */
                unsigned primgroup_size = 128; /* recommended without a GS */
@@ -3284,7 +3289,7 @@ static void r600_set_active_query_state(struct pipe_context *ctx, boolean enable
 static void r600_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
                                    bool include_draw_vbo)
 {
-       r600_need_cs_space((struct r600_context*)ctx, num_dw, include_draw_vbo);
+       r600_need_cs_space((struct r600_context*)ctx, num_dw, include_draw_vbo, 0);
 }
 
 /* keep this at the end of this file, please */