radeonsi: use compute shader for clear 12-byte buffer
author Sonny Jiang <sonny.jiang@amd.com>
Fri, 29 Nov 2019 23:04:54 +0000 (18:04 -0500)
committer Marek Olšák <marek.olsak@amd.com>
Tue, 10 Dec 2019 04:25:57 +0000 (23:25 -0500)
Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
src/gallium/drivers/radeonsi/si_compute_blit.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c

index ff573c131f4e6a7d7076787c5ffa12db675ecd76..8d4f3bab5d75055ff970e83d1536a8404dd5933d 100644 (file)
@@ -73,6 +73,76 @@ static void si_compute_internal_end(struct si_context *sctx)
        sctx->render_cond_force_off = false;
 }
 
+/**
+ * Fill a range of a buffer with a repeating 12-byte (three-dword) pattern
+ * by dispatching the clear_12bytes compute shader.
+ *
+ * The compute-stage state overridden here (shader buffer slot 0, constant
+ * buffer slot 0 and the bound compute program) is captured before the
+ * dispatch and re-bound afterwards.
+ *
+ * sctx:        context the dispatch is issued on
+ * dst:         buffer to clear
+ * dst_offset:  byte offset of the range inside dst; must be a multiple of 4
+ * size:        byte size of the range; must be a multiple of 4
+ * clear_value: pattern to write; exactly 12 bytes are read from it
+ * coher:       coherency mode passed on to si_get_flush_flags()
+ */
+static void si_compute_clear_12bytes_buffer(struct si_context *sctx,
+                                       struct pipe_resource *dst,
+                                       unsigned dst_offset,
+                                       unsigned size,
+                                       const uint32_t *clear_value,
+                                       enum si_coherency coher)
+{
+       struct pipe_context *ctx = &sctx->b;
+
+       assert(dst_offset % 4 == 0);
+       assert(size % 4 == 0);
+       /* Number of 12-byte elements; a trailing partial element is rounded
+        * up so that it is covered by a thread as well. */
+       unsigned size_12 = DIV_ROUND_UP(size, 12);
+
+       /* Constant buffer payload: the three pattern dwords followed by a
+        * zero fourth dword (the shader only stores .xyz). */
+       unsigned data[4] = {0};
+       memcpy(data, clear_value, 12);
+
+       si_compute_internal_begin(sctx);
+
+       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                      SI_CONTEXT_CS_PARTIAL_FLUSH |
+                      si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+       /* Capture the state that is about to be overridden. */
+       struct pipe_shader_buffer saved_sb = {0};
+       si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+       unsigned saved_writable_mask = 0;
+       if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+           (1u << si_get_shaderbuf_slot(0)))
+               saved_writable_mask = 1;
+
+       struct pipe_constant_buffer saved_cb = {0};
+       si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+       void *saved_cs = sctx->cs_shader_state.program;
+
+       /* Bind the pattern as user constant buffer 0. */
+       struct pipe_constant_buffer cb = {0};
+       cb.buffer_size = sizeof(data);
+       cb.user_buffer = data;
+       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+       /* Bind the destination range as writable shader buffer 0. */
+       struct pipe_shader_buffer sb = {0};
+       sb.buffer = dst;
+       sb.buffer_offset = dst_offset;
+       sb.buffer_size = size;
+
+       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+
+       struct pipe_grid_info info = {0};
+
+       /* The shader is created on first use and cached in the context. */
+       if (!sctx->cs_clear_12bytes_buffer)
+               sctx->cs_clear_12bytes_buffer =
+                       si_clear_12bytes_buffer_shader(ctx);
+       ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
+
+       /* 64 threads per block, one thread per 12-byte element. */
+       info.block[0] = 64;
+       /* NOTE(review): evaluates to 0 when size_12 is a multiple of 64;
+        * presumably 0 means "last block is full" - confirm against the
+        * pipe_grid_info documentation. */
+       info.last_block[0] = size_12 % 64;
+       info.block[1] = 1;
+       info.block[2] = 1;
+       info.grid[0] = DIV_ROUND_UP(size_12, 64);
+       info.grid[1] = 1;
+       info.grid[2] = 1;
+
+       ctx->launch_grid(ctx, &info);
+
+       /* Put the caller's compute state back. */
+       ctx->bind_compute_state(ctx, saved_cs);
+       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+       ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+       si_compute_internal_end(sctx);
+
+       /* NOTE(review): the getters used for saved_sb/saved_cb are understood
+        * to return referenced resources (verify in si_descriptors.c); the
+        * re-bind above takes its own reference, so drop ours here to avoid
+        * leaking one reference per clear. */
+       pipe_resource_reference(&saved_sb.buffer, NULL);
+       pipe_resource_reference(&saved_cb.buffer, NULL);
+}
+
 static void si_compute_do_clear_or_copy(struct si_context *sctx,
                                        struct pipe_resource *dst,
                                        unsigned dst_offset,
@@ -231,17 +301,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                clear_value_size = 4;
        }
 
-       /* Use transform feedback for 12-byte clears. */
-       /* TODO: Use compute. */
        if (clear_value_size == 12) {
-               union pipe_color_union streamout_clear_value;
-
-               memcpy(&streamout_clear_value, clear_value, clear_value_size);
-               si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
-               util_blitter_clear_buffer(sctx->blitter, dst, offset,
-                                         size, clear_value_size / 4,
-                                         &streamout_clear_value);
-               si_blitter_end(sctx);
+               si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
                return;
        }
 
index aa627279ed3ca33cc21da9bb141c8f3a5b743fda..0ac70a5fdf5c7503b857c5c00e6859444971aba5 100644 (file)
@@ -232,6 +232,8 @@ static void si_destroy_context(struct pipe_context *context)
                sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
        if (sctx->cs_clear_render_target_1d_array)
                sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
+       if (sctx->cs_clear_12bytes_buffer)
+               sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
        if (sctx->cs_dcc_retile)
                sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
 
index f313e565d5ff30c721733fd204bb52171544321a..b4e066095820a8a3ed8a59be148cf82cb682f282 100644 (file)
@@ -894,6 +894,7 @@ struct si_context {
        void                            *cs_copy_image_1d_array;
        void                            *cs_clear_render_target;
        void                            *cs_clear_render_target_1d_array;
+       void                            *cs_clear_12bytes_buffer;
        void                            *cs_dcc_retile;
        void                            *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
        struct si_screen                *screen;
@@ -1450,6 +1451,7 @@ void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_clear_render_target_shader(struct pipe_context *ctx);
 void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
+void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx);
 void *si_create_dcc_retile_cs(struct pipe_context *ctx);
 void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
                                bool is_array);
index 0cf0cd95a8b401c0b85925da2bfef98c66794b9a..90eb39e3506ede7ed4455433a2e861a5248b1da1 100644 (file)
@@ -665,6 +665,39 @@ void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
        return ctx->create_compute_state(ctx, &state);
 }
 
+/**
+ * Build the compute shader used for 12-byte buffer clears.
+ *
+ * Each thread computes byte offset (BLOCK_ID.x * 64 + THREAD_ID.x) * 12 and
+ * stores the .xyz components of CONST[0][0] to BUFFER[0] at that offset.
+ *
+ * Returns the handle produced by ctx->create_compute_state(), or NULL if
+ * the TGSI text could not be translated.
+ */
+void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
+{
+       static const char shader_src[] =
+               "COMP\n"
+               "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+               "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+               "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+               "DCL SV[0], THREAD_ID\n"
+               "DCL SV[1], BLOCK_ID\n"
+               "DCL BUFFER[0]\n"
+               /* One constant vector: the clear pattern in .xyz(w). */
+               "DCL CONST[0][0..0]\n"
+               "DCL TEMP[0..0]\n"
+               "IMM[0] UINT32 {64, 1, 12, 0}\n"
+               /* TEMP[0].x = BLOCK_ID.x * 64 + THREAD_ID.x */
+               "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+               /* Scale the element index by 12 to get a byte offset. */
+               "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n"
+               "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
+               "END\n";
+
+       struct tgsi_token parsed[1024];
+
+       if (tgsi_text_translate(shader_src, parsed, ARRAY_SIZE(parsed))) {
+               struct pipe_compute_state cs = {
+                       .ir_type = PIPE_SHADER_IR_TGSI,
+                       .prog = parsed,
+               };
+
+               return ctx->create_compute_state(ctx, &cs);
+       }
+
+       /* The text is a compile-time constant, so failing to parse it is a
+        * programming error. */
+       assert(false);
+       return NULL;
+}
+
+
 /* Load samples from the image, and copy them to the same image. This looks like
  * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
  * reordered to match expanded FMASK.