sctx->render_cond_force_off = false;
}
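+
+/* Clear a buffer region with a repeating 12-byte value using a compute
+ * shader that stores one 12-byte element per thread.
+ */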
+static void si_compute_clear_12bytes_buffer(struct si_context *sctx,
+ struct pipe_resource *dst,
+ unsigned dst_offset,
+ unsigned size,
+ const uint32_t *clear_value,
+ enum si_coherency coher)
+{
+ struct pipe_context *ctx = &sctx->b;
+
+ assert(dst_offset % 4 == 0);
+ assert(size % 4 == 0);
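+ /* One compute thread clears one 12-byte element. */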
+ unsigned size_12 = DIV_ROUND_UP(size, 12);
+
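+ /* Pad the 12-byte clear value to 16 bytes for the constant buffer. */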
+ unsigned data[4] = {0};
+ memcpy(data, clear_value, 12);
+
+ si_compute_internal_begin(sctx);
+
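+ /* Wait for prior shaders and flush caches for the destination's coherency domain. */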
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH |
+ si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
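+ /* Save the compute bindings that this internal dispatch clobbers. */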
+ struct pipe_shader_buffer saved_sb = {0};
+ si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+ unsigned saved_writable_mask = 0;
+ if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+ (1u << si_get_shaderbuf_slot(0)))
+ saved_writable_mask = 1;
+
+ struct pipe_constant_buffer saved_cb = {0};
+ si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+ void *saved_cs = sctx->cs_shader_state.program;
+
+ struct pipe_constant_buffer cb = {0};
+ cb.buffer_size = sizeof(data);
+ cb.user_buffer = data;
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
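+ /* Bind the destination range as a writable shader buffer in slot 0. */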
+ struct pipe_shader_buffer sb = {0};
+ sb.buffer = dst;
+ sb.buffer_offset = dst_offset;
+ sb.buffer_size = size;
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+
+ struct pipe_grid_info info = {0};
+
+ if (!sctx->cs_clear_12bytes_buffer)
+ sctx->cs_clear_12bytes_buffer =
+ si_clear_12bytes_buffer_shader(ctx);
+ ctx->bind_compute_state(ctx, sctx->cs_clear_12bytes_buffer);
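+ /* 64 threads per block; last_block handles a partial final block. */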
+ info.block[0] = 64;
+ info.last_block[0] = size_12 % 64;
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(size_12, 64);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+
+ ctx->launch_grid(ctx, &info);
+
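+ /* Restore the saved compute state and bindings. */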
+ ctx->bind_compute_state(ctx, saved_cs);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+ ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+ si_compute_internal_end(sctx);
+}
+
static void si_compute_do_clear_or_copy(struct si_context *sctx,
struct pipe_resource *dst,
unsigned dst_offset,
clear_value_size = 4;
}
- /* Use transform feedback for 12-byte clears. */
- /* TODO: Use compute. */
if (clear_value_size == 12) {
- union pipe_color_union streamout_clear_value;
-
- memcpy(&streamout_clear_value, clear_value, clear_value_size);
- si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
- util_blitter_clear_buffer(sctx->blitter, dst, offset,
- size, clear_value_size / 4,
- &streamout_clear_value);
- si_blitter_end(sctx);
+ si_compute_clear_12bytes_buffer(sctx, dst, offset, size, clear_value, coher);
return;
}
return ctx->create_compute_state(ctx, &state);
}
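+
+/* Create a compute shader that clears a buffer with a 12-byte pattern:
+ * each thread computes its linear element index (block_id * 64 +
+ * thread_id), scales it by 12 to a byte offset, and stores the xyz
+ * components of the clear value there.
+ */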
+void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
+{
+ static const char text[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL SV[0], THREAD_ID\n"
+ "DCL SV[1], BLOCK_ID\n"
+ "DCL BUFFER[0]\n"
+ "DCL CONST[0][0..0]\n" // 0:xyzw
+ "DCL TEMP[0..0]\n"
+ "IMM[0] UINT32 {64, 1, 12, 0}\n"
+ "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+ "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" //12 bytes
+ "STORE BUFFER[0].xyz, TEMP[0].xxxx, CONST[0][0].xyzw\n"
+ "END\n";
+
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {0};
+
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
+ return NULL;
+ }
+
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ return ctx->create_compute_state(ctx, &state);
+}
+
/* Load samples from the image, and copy them to the same image. This looks like
* a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
* reordered to match expanded FMASK.