From 113278ee79a6366ad88a4f584aa1c0310d71b479 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 19 Aug 2017 18:56:36 +0200 Subject: [PATCH] radeonsi: remove Constant Engine support MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We have come to the conclusion that it doesn't improve performance. Tested-by: Dieter Nützel Reviewed-by: Samuel Pitoiset Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeon/r600_gpu_load.c | 4 - src/gallium/drivers/radeon/r600_pipe_common.c | 2 - src/gallium/drivers/radeon/r600_pipe_common.h | 5 +- src/gallium/drivers/radeon/r600_query.c | 3 - src/gallium/drivers/radeon/r600_query.h | 1 - src/gallium/drivers/radeon/radeon_winsys.h | 33 +- src/gallium/drivers/radeonsi/si_compute.c | 4 - src/gallium/drivers/radeonsi/si_debug.c | 32 +- src/gallium/drivers/radeonsi/si_descriptors.c | 295 ++---------------- src/gallium/drivers/radeonsi/si_hw_context.c | 51 +-- src/gallium/drivers/radeonsi/si_pipe.c | 43 --- src/gallium/drivers/radeonsi/si_pipe.h | 10 - src/gallium/drivers/radeonsi/si_state.h | 24 +- src/gallium/drivers/radeonsi/si_state_draw.c | 39 --- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 98 ------ src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 12 +- 16 files changed, 38 insertions(+), 618 deletions(-) diff --git a/src/gallium/drivers/radeon/r600_gpu_load.c b/src/gallium/drivers/radeon/r600_gpu_load.c index 4e9f7ea37cd..d35be4f327a 100644 --- a/src/gallium/drivers/radeon/r600_gpu_load.c +++ b/src/gallium/drivers/radeon/r600_gpu_load.c @@ -68,7 +68,6 @@ #define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) #define DMA_BUSY(x) (((x) >> 22) & 0x1) #define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) -#define CE_BUSY(x) (((x) >> 26) & 0x1) #define IDENTITY(x) x @@ -123,7 +122,6 @@ static void r600_update_mmio_counters(struct r600_common_screen *rscreen, UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY); UPDATE_COUNTER(cp_dma, DMA_BUSY); UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY); - UPDATE_COUNTER(ce, CE_BUSY); } value = gui_busy || sdma_busy; @@ -266,8 +264,6 @@ static unsigned busy_index_from_type(struct r600_common_screen *rscreen, return BUSY_INDEX(rscreen, cp_dma); case R600_QUERY_GPU_SCRATCH_RAM_BUSY: return BUSY_INDEX(rscreen, scratch_ram); - case R600_QUERY_GPU_CE_BUSY: - return BUSY_INDEX(rscreen, ce); default: unreachable("invalid query type"); } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index dc54b5e5b79..37c12dea374 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -815,8 +815,6 @@ static const struct debug_named_value common_debug_options[] = { { "norbplus", DBG_NO_RB_PLUS, "Disable RB+." }, { "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." }, { "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" }, - { "ce", DBG_CE, "Force enable the constant engine" }, - { "noce", DBG_NO_CE, "Disable the constant engine"}, { "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" }, { "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" }, diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index abe9c4ca57b..c10cf18219e 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -72,7 +72,7 @@ struct u_log_context; #define DBG_NIR (1 << 1) #define DBG_COMPUTE (1 << 2) #define DBG_VM (1 << 3) -#define DBG_CE (1 << 4) +/* gap */ /* shader logging */ #define DBG_FS (1 << 5) #define DBG_VS (1 << 6) @@ -108,7 +108,7 @@ struct u_log_context; #define DBG_NO_RB_PLUS (1ull << 45) #define DBG_SI_SCHED (1ull << 46) #define DBG_MONOLITHIC_SHADERS (1ull << 47) -#define DBG_NO_CE (1ull << 48) +/* gap */ #define DBG_UNSAFE_MATH (1ull << 49) #define DBG_NO_DCC_FB (1ull << 50) #define DBG_TEST_VMFAULT_CP (1ull << 51) @@ -375,7 +375,6 @@ union r600_mmio_counters { struct r600_mmio_counter surf_sync; struct r600_mmio_counter cp_dma; struct r600_mmio_counter scratch_ram; - struct r600_mmio_counter ce; } named; unsigned array[0]; }; diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index bccfe7f94f8..98bdd80d739 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -219,7 +219,6 @@ static bool r600_query_sw_begin(struct r600_common_context *rctx, case R600_QUERY_GPU_SURF_SYNC_BUSY: case R600_QUERY_GPU_CP_DMA_BUSY: case R600_QUERY_GPU_SCRATCH_RAM_BUSY: - case R600_QUERY_GPU_CE_BUSY: query->begin_result = r600_begin_counter(rctx->screen, query->b.type); break; @@ -376,7 +375,6 @@ static bool r600_query_sw_end(struct r600_common_context *rctx, case R600_QUERY_GPU_SURF_SYNC_BUSY: case R600_QUERY_GPU_CP_DMA_BUSY: case R600_QUERY_GPU_SCRATCH_RAM_BUSY: - case R600_QUERY_GPU_CE_BUSY: query->end_result = r600_end_counter(rctx->screen, query->b.type, query->begin_result); @@ -2075,7 +2073,6 @@ static struct pipe_driver_query_info r600_driver_query_list[] = { X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), - X("GPU-ce-busy", GPU_CE_BUSY, UINT64, AVERAGE), }; #undef X diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h index 815dc7fc3c2..7455c8e63a8 100644 --- a/src/gallium/drivers/radeon/r600_query.h +++ b/src/gallium/drivers/radeon/r600_query.h @@ -102,7 +102,6 @@ enum { R600_QUERY_GPU_SURF_SYNC_BUSY, R600_QUERY_GPU_CP_DMA_BUSY, R600_QUERY_GPU_SCRATCH_RAM_BUSY, - R600_QUERY_GPU_CE_BUSY, R600_QUERY_NUM_COMPILATIONS, R600_QUERY_NUM_SHADERS_CREATED, R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO, diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 351edcd76f9..b00b1443e74 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -173,8 +173,7 @@ struct radeon_winsys_cs { unsigned max_prev; /* Space in array pointed to by prev. */ unsigned prev_dw; /* Total number of dwords in previous chunks. */ - /* Memory usage of the buffer list. These are always 0 for CE and preamble - * IBs. */ + /* Memory usage of the buffer list. These are always 0 for preamble IBs. */ uint64_t used_vram; uint64_t used_gart; }; @@ -456,36 +455,6 @@ struct radeon_winsys { struct pipe_fence_handle **fence), void *flush_ctx); - /** - * Add a constant engine IB to a graphics CS. This makes the graphics CS - * from "cs_create" a group of two IBs that share a buffer list and are - * flushed together. - * - * The returned constant CS is only a stream for writing packets to the new - * IB. Calling other winsys functions with it is not allowed, not even - * "cs_destroy". - * - * In order to add buffers and check memory usage, use the graphics CS. - * In order to flush it, use the graphics CS, which will flush both IBs. - * Destroying the graphics CS will destroy both of them. - * - * \param cs The graphics CS from "cs_create" that will hold the buffer - * list and will be used for flushing. - */ - struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs); - - /** - * Add a constant engine preamble IB to a graphics CS. This add an extra IB - * in similar manner to cs_add_const_ib. This should always be called after - * cs_add_const_ib. - * - * The returned IB is a constant engine IB that only gets flushed if the - * context changed. - * - * \param cs The graphics CS from "cs_create" that will hold the buffer - * list and will be used for flushing. - */ - struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct radeon_winsys_cs *cs); /** * Destroy a command stream. * diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index d0e481a3f15..3ebd22c3c16 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -844,12 +844,8 @@ static void si_launch_grid( if (program->ir_type == PIPE_SHADER_IR_TGSI) si_setup_tgsi_grid(sctx, info); - si_ce_pre_draw_synchronization(sctx); - si_emit_dispatch_packets(sctx, info); - si_ce_post_draw_synchronization(sctx); - if (unlikely(sctx->current_saved_cs)) si_trace_emit(sctx); diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index b6bddc52160..c2242a6deab 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -274,7 +274,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned ce_begin, ce_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -331,7 +330,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_ce_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -341,10 +339,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) NULL, PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_ce_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -372,21 +368,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->ce_end != chunk->ce_begin) { - assert(ctx->ce_ib); - - if (scs->flushed) { - ac_parse_ib(f, scs->ce.ib + chunk->ce_begin, - chunk->ce_end - chunk->ce_begin, - last_ce_trace_id, "CE IB", ctx->b.chip_class, - NULL, NULL); - } else { - si_parse_current_ib(f, ctx->ce_ib, chunk->ce_begin, - chunk->ce_end, last_ce_trace_id, "CE IB", - ctx->b.chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing.\n\n"); si_dump_bo_list(ctx, &scs->gfx, f); @@ -405,14 +386,9 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->b.gfx.cs->prev_dw + ctx->b.gfx.cs->current.cdw; - unsigned ce_cur = 0; - - if (ctx->ce_ib) - ce_cur = ctx->ce_ib->prev_dw + ctx->ce_ib->current.cdw; if (!dump_bo_list && - gfx_cur == scs->gfx_last_dw && - ce_cur == scs->ce_last_dw) + gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -425,10 +401,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->ce_begin = scs->ce_last_dw; - chunk->ce_end = ce_cur; - scs->ce_last_dw = ce_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index ab399a5fb0d..646a9ec2570 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -97,11 +97,6 @@ static uint32_t null_image_descriptor[8] = { * descriptor */ }; -static uint16_t si_ce_ram_size(struct si_context *sctx) -{ - return sctx->b.chip_class >= GFX9 ? 4096 : 32768; -} - static void si_init_descriptor_list(uint32_t *desc_list, unsigned element_dw_size, unsigned num_elements, @@ -117,32 +112,15 @@ static void si_init_descriptor_list(uint32_t *desc_list, } } -static void si_init_descriptors(struct si_context *sctx, - struct si_descriptors *desc, +static void si_init_descriptors(struct si_descriptors *desc, unsigned shader_userdata_index, unsigned element_dw_size, - unsigned num_elements, - unsigned first_ce_slot, - unsigned num_ce_slots, - unsigned *ce_offset) + unsigned num_elements) { desc->list = CALLOC(num_elements, element_dw_size * 4); desc->element_dw_size = element_dw_size; desc->num_elements = num_elements; - desc->first_ce_slot = sctx->ce_ib ? first_ce_slot : 0; - desc->num_ce_slots = sctx->ce_ib ? num_ce_slots : 0; - desc->dirty_mask = 0; desc->shader_userdata_offset = shader_userdata_index * 4; - - if (desc->num_ce_slots) { - assert(num_elements <= sizeof(desc->dirty_mask)*8); - - desc->uses_ce = true; - desc->ce_offset = *ce_offset; - desc->dirty_mask = u_bit_consecutive64(0, num_elements); - - *ce_offset += element_dw_size * desc->num_ce_slots * 4; - } } static void si_release_descriptors(struct si_descriptors *desc) @@ -151,80 +129,6 @@ static void si_release_descriptors(struct si_descriptors *desc) FREE(desc->list); } -static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, - unsigned *out_offset, struct r600_resource **out_buf) -{ - uint64_t va; - unsigned cache_line_size = sctx->screen->b.info.tcc_cache_line_size; - - /* The base and size should be aligned to the L2 cache line size - * for optimal performance. (all dumps should rewrite whole lines) - */ - size = align(size, cache_line_size); - - (void)si_ce_ram_size; /* silence an "unused" warning */ - assert(ce_offset + size <= si_ce_ram_size(sctx)); - - u_suballocator_alloc(sctx->ce_suballocator, size, cache_line_size, - out_offset, (struct pipe_resource**)out_buf); - if (!out_buf) - return false; - - va = (*out_buf)->gpu_address + *out_offset; - - radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); - radeon_emit(sctx->ce_ib, ce_offset); - radeon_emit(sctx->ce_ib, size / 4); - radeon_emit(sctx->ce_ib, va); - radeon_emit(sctx->ce_ib, va >> 32); - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf, - RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); - - sctx->ce_need_synchronization = true; - return true; -} - -void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx) -{ - bool success = si_ce_upload(sctx, 0, sctx->total_ce_ram_allocated, - &sctx->ce_ram_saved_offset, - &sctx->ce_ram_saved_buffer); - (void)success; - assert(success); -} - -void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx) -{ - if (!sctx->ce_ram_saved_buffer) - return; - - struct radeon_winsys_cs *ib = sctx->ce_preamble_ib; - if (!ib) - ib = sctx->ce_ib; - - uint64_t va = sctx->ce_ram_saved_buffer->gpu_address + - sctx->ce_ram_saved_offset; - - radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0)); - radeon_emit(ib, va); - radeon_emit(ib, va >> 32); - radeon_emit(ib, sctx->total_ce_ram_allocated / 4); - radeon_emit(ib, 0); - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - sctx->ce_ram_saved_buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); -} - -void si_ce_enable_loads(struct radeon_winsys_cs *ib) -{ - radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | - CONTEXT_CONTROL_LOAD_CE_RAM(1)); - radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1)); -} - static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc, struct r600_atom * atom) @@ -240,56 +144,25 @@ static bool si_upload_descriptors(struct si_context *sctx, if (!upload_size) return true; - if (desc->uses_ce) { - const uint32_t *list = desc->list + - desc->first_ce_slot * desc->element_dw_size; - uint64_t mask = (desc->dirty_mask >> desc->first_ce_slot) & - u_bit_consecutive64(0, desc->num_ce_slots); - - - while (mask) { - int begin, count; - u_bit_scan_consecutive_range64(&mask, &begin, &count); - - begin *= desc->element_dw_size; - count *= desc->element_dw_size; - - radeon_emit(sctx->ce_ib, - PKT3(PKT3_WRITE_CONST_RAM, count, 0)); - radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4); - radeon_emit_array(sctx->ce_ib, list + begin, count); - } - - if (!si_ce_upload(sctx, - desc->ce_offset + - (first_slot_offset - desc->first_ce_slot * slot_size), - upload_size, (unsigned*)&desc->buffer_offset, - &desc->buffer)) - return false; - } else { - uint32_t *ptr; - - u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, - si_optimal_tcc_alignment(sctx, upload_size), - (unsigned*)&desc->buffer_offset, - (struct pipe_resource**)&desc->buffer, - (void**)&ptr); - if (!desc->buffer) - return false; /* skip the draw call */ + uint32_t *ptr; + u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), + (unsigned*)&desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, + (void**)&ptr); + if (!desc->buffer) + return false; /* skip the draw call */ - util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, - upload_size); - desc->gpu_list = ptr - first_slot_offset / 4; + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, + upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, - RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); - } + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, + RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); /* The shader pointer should point to slot 0. */ desc->buffer_offset -= first_slot_offset; - desc->dirty_mask = 0; - if (atom) si_mark_atom_dirty(sctx, atom); @@ -598,7 +471,6 @@ static void si_set_sampler_view(struct si_context *sctx, views->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1ull << desc_slot; sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } @@ -750,8 +622,6 @@ si_disable_shader_image(struct si_context *ctx, unsigned shader, unsigned slot) memcpy(descs->list + desc_slot*8, null_image_descriptor, 8*4); images->enabled_mask &= ~(1u << slot); - /* two 8-byte images share one 16-byte slot */ - descs->dirty_mask |= 1u << (desc_slot / 2); ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } } @@ -887,8 +757,6 @@ static void si_set_shader_image(struct si_context *ctx, } images->enabled_mask |= 1u << slot; - /* two 8-byte images share one 16-byte slot */ - descs->dirty_mask |= 1u << (desc_slot / 2); ctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); /* Since this can flush, it must be done after enabled_mask is updated. */ @@ -984,25 +852,20 @@ static void si_bind_sampler_states(struct pipe_context *ctx, continue; memcpy(desc->list + desc_slot * 16 + 12, sstates[i]->val, 4*4); - desc->dirty_mask |= 1ull << desc_slot; sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } } /* BUFFER RESOURCES */ -static void si_init_buffer_resources(struct si_context *sctx, - struct si_buffer_resources *buffers, +static void si_init_buffer_resources(struct si_buffer_resources *buffers, struct si_descriptors *descs, unsigned num_buffers, - unsigned first_ce_slot, - unsigned num_ce_slots, unsigned shader_userdata_index, enum radeon_bo_usage shader_usage, enum radeon_bo_usage shader_usage_constbuf, enum radeon_bo_priority priority, - enum radeon_bo_priority priority_constbuf, - unsigned *ce_offset) + enum radeon_bo_priority priority_constbuf) { buffers->shader_usage = shader_usage; buffers->shader_usage_constbuf = shader_usage_constbuf; @@ -1010,8 +873,7 @@ static void si_init_buffer_resources(struct si_context *sctx, buffers->priority_constbuf = priority_constbuf; buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); - si_init_descriptors(sctx, descs, shader_userdata_index, 4, num_buffers, - first_ce_slot, num_ce_slots, ce_offset); + si_init_descriptors(descs, shader_userdata_index, 4, num_buffers); } static void si_release_buffer_resources(struct si_buffer_resources *buffers, @@ -1277,7 +1139,6 @@ static void si_set_constant_buffer(struct si_context *sctx, buffers->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << descriptors_idx; } @@ -1339,7 +1200,6 @@ static void si_set_shader_buffers(struct pipe_context *ctx, pipe_resource_reference(&buffers->buffers[slot], NULL); memset(desc, 0, sizeof(uint32_t) * 4); buffers->enabled_mask &= ~(1u << slot); - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << si_const_and_shader_buffer_descriptors_idx(shader); continue; @@ -1366,7 +1226,6 @@ static void si_set_shader_buffers(struct pipe_context *ctx, buf->bind_history |= PIPE_BIND_SHADER_BUFFER; buffers->enabled_mask |= 1u << slot; - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << si_const_and_shader_buffer_descriptors_idx(shader); @@ -1486,7 +1345,6 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint slot, buffers->enabled_mask &= ~(1u << slot); } - descs->dirty_mask |= 1u << slot; sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; } @@ -1591,7 +1449,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, NULL); buffers->enabled_mask &= ~(1u << bufidx); } - descs->dirty_mask |= 1u << bufidx; } for (; i < old_num_targets; i++) { bufidx = SI_VS_STREAMOUT_BUF0 + i; @@ -1599,7 +1456,6 @@ static void si_set_streamout_targets(struct pipe_context *ctx, memset(descs->list + bufidx*4, 0, sizeof(uint32_t) * 4); pipe_resource_reference(&buffers->buffers[bufidx], NULL); buffers->enabled_mask &= ~(1u << bufidx); - descs->dirty_mask |= 1u << bufidx; } sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; @@ -1718,7 +1574,6 @@ static void si_reset_buffer_resources(struct si_context *sctx, si_desc_reset_buffer_offset(&sctx->b.b, descs->list + i*4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= 1u << descriptors_idx; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, @@ -1772,7 +1627,6 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf si_desc_reset_buffer_offset(ctx, descs->list + i*4, old_va, buf); - descs->dirty_mask |= 1u << i; sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS; radeon_add_to_buffer_list_check_mem(&sctx->b, &sctx->b.gfx, @@ -1827,7 +1681,6 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf descs->list + desc_slot * 16 + 4, old_va, buf); - descs->dirty_mask |= 1ull << desc_slot; sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); @@ -1860,8 +1713,6 @@ static void si_rebind_buffer(struct pipe_context *ctx, struct pipe_resource *buf si_desc_reset_buffer_offset( ctx, descs->list + desc_slot * 8 + 4, old_va, buf); - /* two 8-byte images share one 16-byte slot */ - descs->dirty_mask |= 1u << (desc_slot / 2); sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); @@ -2328,8 +2179,7 @@ static void si_init_bindless_descriptors(struct si_context *sctx, unsigned shader_userdata_index, unsigned num_elements) { - si_init_descriptors(sctx, desc, shader_userdata_index, 16, num_elements, - 0, 0, NULL); + si_init_descriptors(desc, shader_userdata_index, 16, num_elements); sctx->bindless_descriptors.num_active_slots = num_elements; /* The first bindless descriptor is stored at slot 1, because 0 is not @@ -2753,52 +2603,9 @@ void si_all_resident_buffers_begin_new_cs(struct si_context *sctx) /* INIT/DEINIT/UPLOAD */ -/* GFX9 has only 4KB of CE, while previous chips had 32KB. In order - * to make CE RAM as useful as possible, this defines limits - * for the number slots that can be in CE RAM on GFX9. If a shader - * is using more, descriptors will be uploaded to memory directly and - * CE won't be used. - * - * These numbers are based on shader-db. - */ -static unsigned gfx9_max_ce_samplers[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = 0, - [PIPE_SHADER_TESS_CTRL] = 0, - [PIPE_SHADER_TESS_EVAL] = 1, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 24, - [PIPE_SHADER_COMPUTE] = 16, -}; -static unsigned gfx9_max_ce_images[SI_NUM_SHADERS] = { - /* these must be even due to slot alignment */ - [PIPE_SHADER_VERTEX] = 0, - [PIPE_SHADER_TESS_CTRL] = 0, - [PIPE_SHADER_TESS_EVAL] = 0, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 2, - [PIPE_SHADER_COMPUTE] = 8, -}; -static unsigned gfx9_max_ce_const_buffers[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = 9, - [PIPE_SHADER_TESS_CTRL] = 3, - [PIPE_SHADER_TESS_EVAL] = 5, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 8, - [PIPE_SHADER_COMPUTE] = 6, -}; -static unsigned gfx9_max_ce_shader_buffers[SI_NUM_SHADERS] = { - [PIPE_SHADER_VERTEX] = 0, - [PIPE_SHADER_TESS_CTRL] = 0, - [PIPE_SHADER_TESS_EVAL] = 0, - [PIPE_SHADER_GEOMETRY] = 0, - [PIPE_SHADER_FRAGMENT] = 12, - [PIPE_SHADER_COMPUTE] = 13, -}; - void si_init_all_descriptors(struct si_context *sctx) { int i; - unsigned ce_offset = 0; STATIC_ASSERT(GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS % 2 == 0); STATIC_ASSERT(GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS % 2 == 0); @@ -2809,49 +2616,28 @@ void si_init_all_descriptors(struct si_context *sctx) unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS; unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS; - unsigned first_sampler_ce_slot = 0; - unsigned num_sampler_ce_slots = num_sampler_slots; - - unsigned first_buffer_ce_slot = 0; - unsigned num_buffer_ce_slots = num_buffer_slots; - - /* Adjust CE slot ranges based on GFX9 CE RAM limits. */ if (sctx->b.chip_class >= GFX9) { gfx9_tcs = i == PIPE_SHADER_TESS_CTRL; gfx9_gs = i == PIPE_SHADER_GEOMETRY; - - first_sampler_ce_slot = - si_get_image_slot(gfx9_max_ce_images[i] - 1) / 2; - num_sampler_ce_slots = gfx9_max_ce_images[i] / 2 + - gfx9_max_ce_samplers[i]; - - first_buffer_ce_slot = - si_get_shaderbuf_slot(gfx9_max_ce_shader_buffers[i] - 1); - num_buffer_ce_slots = gfx9_max_ce_shader_buffers[i] + - gfx9_max_ce_const_buffers[i]; } - si_init_buffer_resources(sctx, &sctx->const_and_shader_buffers[i], + si_init_buffer_resources(&sctx->const_and_shader_buffers[i], si_const_and_shader_buffer_descriptors(sctx, i), num_buffer_slots, - first_buffer_ce_slot, num_buffer_ce_slots, gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS : gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS : SI_SGPR_CONST_AND_SHADER_BUFFERS, RADEON_USAGE_READWRITE, RADEON_USAGE_READ, RADEON_PRIO_SHADER_RW_BUFFER, - RADEON_PRIO_CONST_BUFFER, - &ce_offset); + RADEON_PRIO_CONST_BUFFER); struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i); - si_init_descriptors(sctx, desc, + si_init_descriptors(desc, gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES : gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES : SI_SGPR_SAMPLERS_AND_IMAGES, - 16, num_sampler_slots, - first_sampler_ce_slot, num_sampler_ce_slots, - &ce_offset); + 16, num_sampler_slots); int j; for (j = 0; j < SI_NUM_IMAGES; j++) @@ -2860,19 +2646,17 @@ void si_init_all_descriptors(struct si_context *sctx) memcpy(desc->list + j * 8, null_texture_descriptor, 8 * 4); } - si_init_buffer_resources(sctx, &sctx->rw_buffers, + si_init_buffer_resources(&sctx->rw_buffers, &sctx->descriptors[SI_DESCS_RW_BUFFERS], - SI_NUM_RW_BUFFERS, 0, SI_NUM_RW_BUFFERS, - SI_SGPR_RW_BUFFERS, + SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, /* The second set of usage/priority is used by * const buffers in RW buffer slots. */ RADEON_USAGE_READWRITE, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER, - &ce_offset); + RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER); sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; - si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, - 4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL); + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, + 4, SI_NUM_VERTEX_BUFFERS); FREE(sctx->vertex_buffers.list); /* not used */ sctx->vertex_buffers.list = NULL; @@ -2884,9 +2668,6 @@ void si_init_all_descriptors(struct si_context *sctx) 1024); sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS); - sctx->total_ce_ram_allocated = ce_offset; - - assert(ce_offset <= si_ce_ram_size(sctx)); /* Set pipe_context functions. */ sctx->b.b.bind_sampler_states = si_bind_sampler_states; @@ -3026,26 +2807,6 @@ void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, first + count > desc->first_active_slot + desc->num_active_slots) sctx->descriptors_dirty |= 1u << desc_idx; - /* Enable or disable CE for this descriptor array. */ - bool used_ce = desc->uses_ce; - desc->uses_ce = desc->first_ce_slot <= first && - desc->first_ce_slot + desc->num_ce_slots >= first + count; - - if (desc->uses_ce != used_ce) { - /* Upload or dump descriptors if we're disabling or enabling CE, - * respectively. */ - sctx->descriptors_dirty |= 1u << desc_idx; - - /* If we're enabling CE, re-upload all descriptors to CE RAM. - * When CE was disabled, uploads to CE RAM stopped. - */ - if (desc->uses_ce) { - desc->dirty_mask |= - u_bit_consecutive64(desc->first_ce_slot, - desc->num_ce_slots); - } - } - desc->first_active_slot = first; desc->num_active_slots = count; } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 7481d013f3c..f3af8dcb446 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -27,40 +27,9 @@ #include "si_pipe.h" #include "radeon/r600_cs.h" -static unsigned si_descriptor_list_cs_space(unsigned count, unsigned element_size) -{ - /* Ensure we have enough space to start a new range in a hole */ - assert(element_size >= 3); - - /* 5 dwords for write to L2 + 3 bytes for the packet header of - * every disjoint range written to CE RAM. - */ - return 5 + (3 * count / 2) + count * element_size; -} - -static unsigned si_ce_needed_cs_space(void) -{ - unsigned space = 0; - - space += si_descriptor_list_cs_space(SI_NUM_SHADER_BUFFERS + - SI_NUM_CONST_BUFFERS, 4); - /* two 8-byte images share one 16-byte slot */ - space += si_descriptor_list_cs_space(SI_NUM_IMAGES / 2 + - SI_NUM_SAMPLERS, 16); - space *= SI_NUM_SHADERS; - - space += si_descriptor_list_cs_space(SI_NUM_RW_BUFFERS, 4); - - /* Increment CE counter packet */ - space += 2; - - return space; -} - void si_destroy_saved_cs(struct si_saved_cs *scs) { radeon_clear_saved_cs(&scs->gfx); - radeon_clear_saved_cs(&scs->ce); r600_resource_reference(&scs->trace_buf, NULL); free(scs); } @@ -69,7 +38,6 @@ void si_destroy_saved_cs(struct si_saved_cs *scs) void si_need_cs_space(struct si_context *ctx) { struct radeon_winsys_cs *cs = ctx->b.gfx.cs; - struct radeon_winsys_cs *ce_ib = ctx->ce_ib; /* There is no need to flush the DMA IB here, because * r600_need_dma_space always flushes the GFX IB if there is @@ -95,8 +63,7 @@ void si_need_cs_space(struct si_context *ctx) /* If the CS is sufficiently large, don't count the space needed * and just flush if there is not enough space left. */ - if (!ctx->b.ws->cs_check_space(cs, 2048) || - (ce_ib && !ctx->b.ws->cs_check_space(ce_ib, si_ce_needed_cs_space()))) + if (!ctx->b.ws->cs_check_space(cs, 2048)) ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } @@ -131,10 +98,6 @@ void si_context_gfx_flush(void *context, unsigned flags, ctx->gfx_flush_in_progress = true; - /* This CE dump should be done in parallel with the last draw. */ - if (ctx->ce_ib) - si_ce_save_all_descriptors_at_ib_end(ctx); - r600_preflush_suspend_features(&ctx->b); ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | @@ -153,8 +116,6 @@ void si_context_gfx_flush(void *context, unsigned flags, /* Save the IB for debug contexts. */ radeon_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true); - if (ctx->ce_ib) - radeon_save_cs(ws, ctx->ce_ib, &ctx->current_saved_cs->ce, false); ctx->current_saved_cs->flushed = true; } @@ -183,7 +144,7 @@ void si_context_gfx_flush(void *context, unsigned flags, static void si_begin_cs_debug(struct si_context *ctx) { - static const uint32_t zeros[2]; + static const uint32_t zeros[1]; assert(!ctx->current_saved_cs); ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs)); @@ -233,14 +194,6 @@ void si_begin_new_cs(struct si_context *ctx) if (ctx->init_config_gs_rings) si_pm4_emit(ctx, ctx->init_config_gs_rings); - if (ctx->ce_preamble_ib) - si_ce_enable_loads(ctx->ce_preamble_ib); - else if (ctx->ce_ib) - si_ce_enable_loads(ctx->ce_ib); - - if (ctx->ce_ib) - si_ce_restore_all_descriptors_at_ib_start(ctx); - if (ctx->queued.named.ls) ctx->prefetch_L2_mask |= SI_PREFETCH_LS; if (ctx->queued.named.hs) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1523eaed941..3ceaaac165a 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -55,10 +55,6 @@ static void si_destroy_context(struct pipe_context *context) si_release_all_descriptors(sctx); - if (sctx->ce_suballocator) - u_suballocator_destroy(sctx->ce_suballocator); - - r600_resource_reference(&sctx->ce_ram_saved_buffer, NULL); pipe_resource_reference(&sctx->esgs_ring, NULL); pipe_resource_reference(&sctx->gsvs_ring, NULL); pipe_resource_reference(&sctx->tf_ring, NULL); @@ -210,45 +206,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, sctx); - - bool enable_ce = sscreen->b.chip_class != SI && /* SI hangs */ - /* These can't use CE due to a power gating bug in the kernel. */ - sscreen->b.family != CHIP_CARRIZO && - sscreen->b.family != CHIP_STONEY; - - /* CE is currently disabled by default, because it makes s_load latency - * worse, because CE IB doesn't run in lockstep with DE. - * Remove this line after that performance issue has been resolved. - */ - enable_ce = false; - - /* Apply CE overrides. */ - if (sscreen->b.debug_flags & DBG_NO_CE) - enable_ce = false; - else if (sscreen->b.debug_flags & DBG_CE) - enable_ce = true; - - if (ws->cs_add_const_ib && enable_ce) { - sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs); - if (!sctx->ce_ib) - goto fail; - - if (ws->cs_add_const_preamble_ib) { - sctx->ce_preamble_ib = - ws->cs_add_const_preamble_ib(sctx->b.gfx.cs); - - if (!sctx->ce_preamble_ib) - goto fail; - } - - sctx->ce_suballocator = - u_suballocator_create(&sctx->b.b, 1024 * 1024, 0, - PIPE_USAGE_DEFAULT, - R600_RESOURCE_FLAG_UNMAPPABLE, false); - if (!sctx->ce_suballocator) - goto fail; - } - sctx->b.gfx.flush = si_context_gfx_flush; /* Border colors. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index fd99c975ad4..69a35ea1945 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -266,12 +266,10 @@ struct si_saved_cs { struct pipe_reference reference; struct si_context *ctx; struct radeon_saved_cs gfx; - struct radeon_saved_cs ce; struct r600_resource *trace_buf; unsigned trace_id; unsigned gfx_last_dw; - unsigned ce_last_dw; bool flushed; }; @@ -288,15 +286,7 @@ struct si_context { struct si_shader_ctx_state fixed_func_tcs_shader; struct r600_resource *wait_mem_scratch; unsigned wait_mem_number; - - struct radeon_winsys_cs *ce_ib; - struct radeon_winsys_cs *ce_preamble_ib; - struct r600_resource *ce_ram_saved_buffer; - struct u_suballocator *ce_suballocator; - unsigned ce_ram_saved_offset; - uint16_t total_ce_ram_allocated; uint16_t prefetch_L2_mask; - bool ce_need_synchronization:1; bool gfx_flush_in_progress:1; bool compute_is_busy:1; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 2b3c37fa16d..ca701658d0b 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -224,8 +224,6 @@ struct si_descriptors { uint32_t *list; /* The list in mapped GPU memory. */ uint32_t *gpu_list; - /* Slots that have been changed and need to be uploaded. */ - uint64_t dirty_mask; /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; @@ -236,27 +234,12 @@ struct si_descriptors { /* The maximum number of descriptors. */ uint32_t num_elements; - /* Offset in CE RAM */ - uint16_t ce_offset; - - /* Slots allocated in CE RAM. If we get active slots outside of this - * range, direct uploads to memory will be used instead. This basically - * governs switching between onchip (CE) and offchip (upload) modes. - */ - uint32_t first_ce_slot; - uint32_t num_ce_slots; - /* Slots that are used by currently-bound shaders. - * With CE: It determines which slots are dumped to L2. - * It doesn't skip uploads to CE RAM. - * Without CE: It determines which slots are uploaded. + * It determines which slots are uploaded. */ uint32_t first_active_slot; uint32_t num_active_slots; - /* Whether CE is used to upload this descriptor array. */ - bool uses_ce; - /* The SGPR index where the 64-bit pointer to the descriptor array will * be stored. */ ubyte shader_userdata_offset; @@ -307,9 +290,6 @@ struct si_buffer_resources { } while(0) /* si_descriptors.c */ -void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx); -void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx); -void si_ce_enable_loads(struct radeon_winsys_cs *ib); void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct r600_texture *tex, const struct legacy_surf_level *base_level_info, @@ -401,8 +381,6 @@ void si_get_active_slot_masks(const struct tgsi_shader_info *info, /* si_state_draw.c */ void si_init_ia_multi_vgt_param_table(struct si_context *sctx); void si_emit_cache_flush(struct si_context *sctx); -void si_ce_pre_draw_synchronization(struct si_context *sctx); -void si_ce_post_draw_synchronization(struct si_context *sctx); void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo); void si_trace_emit(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cb9bbd20805..f2b889677a5 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1145,27 +1145,6 @@ static void si_get_draw_start_count(struct si_context *sctx, } } -void si_ce_pre_draw_synchronization(struct si_context *sctx) -{ - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->ce_ib, PKT3(PKT3_INCREMENT_CE_COUNTER, 0, 0)); - radeon_emit(sctx->ce_ib, 1); /* 1 = increment CE counter */ - - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_WAIT_ON_CE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 0); /* 0 = don't flush sL1 conditionally */ - } -} - -void si_ce_post_draw_synchronization(struct si_context *sctx) -{ - if (sctx->ce_need_synchronization) { - radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0)); - radeon_emit(sctx->b.gfx.cs, 0); /* unused */ - - sctx->ce_need_synchronization = false; - } -} - static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, unsigned skip_atom_mask) { @@ -1413,7 +1392,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) sctx->dirty_atoms = 0; } - si_ce_pre_draw_synchronization(sctx); si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* <-- CUs are busy here. */ @@ -1436,12 +1414,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) return; si_emit_all_states(sctx, info, 0); - si_ce_pre_draw_synchronization(sctx); si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); } - si_ce_post_draw_synchronization(sctx); - if (unlikely(sctx->current_saved_cs)) si_trace_emit(sctx); @@ -1485,20 +1460,6 @@ void si_trace_emit(struct si_context *sctx) radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); - if (sctx->ce_ib) { - struct radeon_winsys_cs *ce = sctx->ce_ib; - - radeon_emit(ce, PKT3(PKT3_WRITE_DATA, 3, 0)); - radeon_emit(ce, S_370_DST_SEL(V_370_MEM_ASYNC) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(V_370_CE)); - radeon_emit(ce, va + 4); - radeon_emit(ce, (va + 4) >> 32); - radeon_emit(ce, trace_id); - radeon_emit(ce, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(ce, AC_ENCODE_TRACE_POINT(trace_id)); - } - if (sctx->b.log) u_log_flush(sctx->b.log); } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index d2662538865..9cadfc4298d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -566,12 +566,6 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) buffer_size = MIN2(buffer_size, 4 * 512 * 1024); switch (ib->ib_type) { - case IB_CONST_PREAMBLE: - buffer_size = MAX2(buffer_size, 4 * 1024); - break; - case IB_CONST: - buffer_size = MAX2(buffer_size, 16 * 1024 * 4); - break; case IB_MAIN: buffer_size = MAX2(buffer_size, 8 * 1024 * 4); break; @@ -609,13 +603,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 */ return 20 * 1024; - case IB_CONST_PREAMBLE: - case IB_CONST: - /* There isn't really any reason to limit CE IB size beyond the natural - * limit implied by the main IB, except perhaps GTT size. Just return - * an extremely large value that we never get anywhere close to. - */ - return 16 * 1024 * 1024; default: unreachable("bad ib_type"); } @@ -634,14 +621,6 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, unsigned ib_size = 0; switch (ib_type) { - case IB_CONST_PREAMBLE: - ib = &cs->const_preamble_ib; - ib_size = 256 * 4; - break; - case IB_CONST: - ib = &cs->const_ib; - ib_size = 8 * 1024 * 4; - break; case IB_MAIN: ib = &cs->main; ib_size = 4 * 1024 * 4; @@ -728,10 +707,6 @@ static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs, cs->request.number_of_ibs = 1; cs->request.ibs = &cs->ib[IB_MAIN]; - cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE; - cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | - AMDGPU_IB_FLAG_PREAMBLE; - return true; } @@ -799,8 +774,6 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, cs->ring_type = ring_type; cs->main.ib_type = IB_MAIN; - cs->const_ib.ib_type = IB_CONST; - cs->const_preamble_ib.ib_type = IB_CONST_PREAMBLE; if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) { FREE(cs); @@ -828,52 +801,6 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, return &cs->main.base; } -static struct radeon_winsys_cs * -amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs) -{ - struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; - struct amdgpu_winsys *ws = cs->ctx->ws; - - /* only one const IB can be added */ - if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped) - return NULL; - - if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST)) - return NULL; - - cs->csc->request.number_of_ibs = 2; - cs->csc->request.ibs = &cs->csc->ib[IB_CONST]; - - cs->cst->request.number_of_ibs = 2; - cs->cst->request.ibs = &cs->cst->ib[IB_CONST]; - - return &cs->const_ib.base; -} - -static struct radeon_winsys_cs * -amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs) -{ - struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; - struct amdgpu_winsys *ws = cs->ctx->ws; - - /* only one const preamble IB can be added and only when the const IB has - * also been mapped */ - if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped || - cs->const_preamble_ib.ib_mapped) - return NULL; - - if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE)) - return NULL; - - cs->csc->request.number_of_ibs = 3; - cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE]; - - cs->cst->request.number_of_ibs = 3; - cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE]; - - return &cs->const_preamble_ib.base; -} - static bool amdgpu_cs_validate(struct radeon_winsys_cs *rcs) { return true; @@ -1323,15 +1250,6 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, while (rcs->current.cdw & 7) radeon_emit(rcs, 0xffff1000); /* type3 nop packet */ } - - /* Also pad the const IB. */ - if (cs->const_ib.ib_mapped) - while (!cs->const_ib.base.current.cdw || (cs->const_ib.base.current.cdw & 7)) - radeon_emit(&cs->const_ib.base, 0xffff1000); /* type3 nop packet */ - - if (cs->const_preamble_ib.ib_mapped) - while (!cs->const_preamble_ib.base.current.cdw || (cs->const_preamble_ib.base.current.cdw & 7)) - radeon_emit(&cs->const_preamble_ib.base, 0xffff1000); break; case RING_UVD: while (rcs->current.cdw & 15) @@ -1358,12 +1276,6 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, /* Set IB sizes. */ amdgpu_ib_finalize(&cs->main); - if (cs->const_ib.ib_mapped) - amdgpu_ib_finalize(&cs->const_ib); - - if (cs->const_preamble_ib.ib_mapped) - amdgpu_ib_finalize(&cs->const_preamble_ib); - /* Create a fence. */ amdgpu_fence_reference(&cur->fence, NULL); if (cs->next_fence) { @@ -1409,10 +1321,6 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, } amdgpu_get_new_ib(&ws->base, cs, IB_MAIN); - if (cs->const_ib.ib_mapped) - amdgpu_get_new_ib(&ws->base, cs, IB_CONST); - if (cs->const_preamble_ib.ib_mapped) - amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE); cs->main.base.used_gart = 0; cs->main.base.used_vram = 0; @@ -1434,10 +1342,6 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) p_atomic_dec(&cs->ctx->ws->num_cs); pb_reference(&cs->main.big_ib_buffer, NULL); FREE(cs->main.base.prev); - pb_reference(&cs->const_ib.big_ib_buffer, NULL); - FREE(cs->const_ib.base.prev); - pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL); - FREE(cs->const_preamble_ib.base.prev); amdgpu_destroy_cs_context(&cs->csc1); amdgpu_destroy_cs_context(&cs->csc2); amdgpu_fence_reference(&cs->next_fence, NULL); @@ -1460,8 +1364,6 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws) ws->base.ctx_destroy = amdgpu_ctx_destroy; ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; - ws->base.cs_add_const_ib = amdgpu_cs_add_const_ib; - ws->base.cs_add_const_preamble_ib = amdgpu_cs_add_const_preamble_ib; ws->base.cs_destroy = amdgpu_cs_destroy; ws->base.cs_add_buffer = amdgpu_cs_add_buffer; ws->base.cs_validate = amdgpu_cs_validate; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index d83c1e0fe19..8f5c33678eb 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -59,10 +59,8 @@ struct amdgpu_cs_buffer { }; enum ib_type { - IB_CONST_PREAMBLE = 0, - IB_CONST = 1, /* the const IB must be first */ - IB_MAIN = 2, - IB_NUM + IB_MAIN, + IB_NUM, }; struct amdgpu_ib { @@ -117,8 +115,6 @@ struct amdgpu_cs_context { struct amdgpu_cs { struct amdgpu_ib main; /* must be first because this is inherited */ - struct amdgpu_ib const_ib; /* optional constant engine IB */ - struct amdgpu_ib const_preamble_ib; struct amdgpu_ctx *ctx; enum ring_type ring_type; @@ -199,10 +195,6 @@ amdgpu_cs_from_ib(struct amdgpu_ib *ib) switch (ib->ib_type) { case IB_MAIN: return get_container(ib, struct amdgpu_cs, main); - case IB_CONST: - return get_container(ib, struct amdgpu_cs, const_ib); - case IB_CONST_PREAMBLE: - return get_container(ib, struct amdgpu_cs, const_preamble_ib); default: unreachable("bad ib_type"); } -- 2.30.2