{
if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
coher == SI_COHERENCY_CP)) ||
- (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
+ (sctx->chip_class >= GFX7 && coher == SI_COHERENCY_SHADER))
return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
return L2_BYPASS;
case SI_COHERENCY_CP:
return 0;
case SI_COHERENCY_SHADER:
- return SI_CONTEXT_INV_SMEM_L1 |
- SI_CONTEXT_INV_VMEM_L1 |
- (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+ return SI_CONTEXT_INV_SCACHE |
+ SI_CONTEXT_INV_VCACHE |
+ (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
case SI_COHERENCY_CB_META:
return SI_CONTEXT_FLUSH_AND_INV_CB;
}
struct pipe_shader_buffer saved_sb[2] = {};
si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+ unsigned saved_writable_mask = 0;
+ for (unsigned i = 0; i < (src ? 2 : 1); i++) {
+ if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+ (1u << si_get_shaderbuf_slot(i)))
+ saved_writable_mask |= 1 << i;
+ }
+
/* The memory accesses are coalesced, meaning that the 1st instruction writes
* the 1st contiguous block of data for the whole wave, the 2nd instruction
* writes the 2nd contiguous block of data, etc.
SI_COMPUTE_CLEAR_DW_PER_THREAD;
unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
- unsigned dwords_per_wave = dwords_per_thread * 64;
+ unsigned wave_size = sctx->screen->compute_wave_size;
+ unsigned dwords_per_wave = dwords_per_thread * wave_size;
unsigned num_dwords = size / 4;
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
struct pipe_grid_info info = {};
- info.block[0] = MIN2(64, num_instructions);
+ info.block[0] = MIN2(wave_size, num_instructions);
info.block[1] = 1;
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
sb[1].buffer_offset = src_offset;
sb[1].buffer_size = size;
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1);
if (!sctx->cs_copy_buffer) {
sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
for (unsigned i = 0; i < 4; i++)
sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1);
if (!sctx->cs_clear_buffer) {
sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+ (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
if (cache_policy != L2_BYPASS)
si_resource(dst)->TC_L2_dirty = true;
/* Restore states. */
ctx->bind_compute_state(ctx, saved_cs);
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
+ saved_writable_mask);
si_compute_internal_end(sctx);
}
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, uint32_t *clear_value,
- uint32_t clear_value_size, enum si_coherency coher)
+ uint32_t clear_value_size, enum si_coherency coher,
+ bool force_cpdma)
{
if (!size)
return;
- unsigned clear_alignment = MIN2(clear_value_size, 4);
+ ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4);
assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
assert(offset % clear_alignment == 0);
* about buffer placements.
*/
if (clear_value_size > 4 ||
- (clear_value_size == 4 &&
+ (!force_cpdma &&
+ clear_value_size == 4 &&
offset % 4 == 0 &&
- (size > 32*1024 || sctx->chip_class <= VI))) {
+ (size > 32*1024 || sctx->chip_class <= GFX8))) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
aligned_size, clear_value,
clear_value_size, coher);
int clear_value_size)
{
si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
- clear_value_size, SI_COHERENCY_SHADER);
+ clear_value_size, SI_COHERENCY_SHADER, false);
}
void si_copy_buffer(struct si_context *sctx,
ctx->launch_grid(ctx, &info);
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+ (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
ctx->bind_compute_state(ctx, saved_cs);
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
SI_CONTEXT_CS_PARTIAL_FLUSH |
si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
- si_emit_cache_flush(sctx);
+ sctx->emit_cache_flush(sctx);
/* Save states. */
void *saved_cs = sctx->cs_shader_state.program;
ctx->launch_grid(ctx, &info);
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+ (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
ctx->bind_compute_state(ctx, saved_cs);
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);