return NULL;
}
- alloc_buffer.alloc_size = 4 * 1024;
- alloc_buffer.phys_alignment = 4 *1024;
+ alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
+ alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
/* COMMAND SUBMISSION */
-static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
+static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_ib *ib,
+ struct amdgpu_cs_ib_info *info, unsigned ib_type)
{
+ struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws; /* used for info.gart_page_size below */
/* Small IBs are better than big IBs, because the GPU goes idle quicker
* and there is less waiting for buffers and fences. Proof:
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
*/
- const unsigned buffer_size = 128 * 1024 * 4;
- const unsigned ib_size = 20 * 1024 * 4;
+ unsigned buffer_size, ib_size; /* size of the backing buffer, and of the slice reserved per IB */
- cs->base.cdw = 0;
- cs->base.buf = NULL;
+ switch (ib_type) { /* each IB type gets its own backing-buffer and slice size */
+ case IB_CONST_PREAMBLE:
+ buffer_size = 4 * 1024 * 4;
+ ib_size = 1024 * 4;
+ break;
+ case IB_CONST:
+ buffer_size = 512 * 1024 * 4;
+ ib_size = 128 * 1024 * 4;
+ break;
+ case IB_MAIN:
+ buffer_size = 128 * 1024 * 4;
+ ib_size = 20 * 1024 * 4;
+ break;
+ default:
+ unreachable("unhandled IB type");
+ }
+ /* Leave the CS empty until a slice of the backing buffer is assigned at
+ * the end of this function, so the "return false" paths below exit with
+ * buf == NULL and cdw == 0. */
+ ib->base.cdw = 0;
+ ib->base.buf = NULL;
/* Allocate a new buffer for IBs if the current buffer is all used. */
- if (!cs->big_ib_buffer ||
- cs->used_ib_space + ib_size > cs->big_ib_buffer->size) {
- struct radeon_winsys *ws = &cs->ctx->ws->base;
- struct radeon_winsys_cs_handle *winsys_bo;
-
- pb_reference(&cs->big_ib_buffer, NULL);
- cs->big_ib_winsys_buffer = NULL;
- cs->ib_mapped = NULL;
- cs->used_ib_space = 0;
-
- cs->big_ib_buffer = ws->buffer_create(ws, buffer_size,
- 4096, true,
+ if (!ib->big_ib_buffer ||
+ ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
+ /* Drop the reference to the old buffer (if any) and restart at offset 0. */
+ pb_reference(&ib->big_ib_buffer, NULL);
+ ib->ib_mapped = NULL;
+ ib->used_ib_space = 0;
+ /* GTT buffer aligned to the GART page size, with CPU access requested. */
+ ib->big_ib_buffer = ws->buffer_create(ws, buffer_size,
+ aws->info.gart_page_size,
RADEON_DOMAIN_GTT,
RADEON_FLAG_CPU_ACCESS);
- if (!cs->big_ib_buffer)
+ if (!ib->big_ib_buffer)
return false;
- winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer);
-
- cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE);
- if (!cs->ib_mapped) {
- pb_reference(&cs->big_ib_buffer, NULL);
+ ib->ib_mapped = ws->buffer_map(ib->big_ib_buffer, NULL,
+ PIPE_TRANSFER_WRITE);
+ if (!ib->ib_mapped) {
+ pb_reference(&ib->big_ib_buffer, NULL);
return false;
}
-
- cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo;
}
- cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
- cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
- cs->base.max_dw = ib_size / 4;
+ info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va +
+ ib->used_ib_space; /* buffer va plus the offset of the slice handed out now */
+ ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
+ ib->base.max_dw = ib_size / 4; /* ib_size is in bytes, max_dw counts 32-bit words */
return true;
}
break;
}
- cs->request.number_of_ibs = 1;
- cs->request.ibs = &cs->ib;
-
cs->max_num_buffers = 512;
cs->buffers = (struct amdgpu_cs_buffer*)
CALLOC(1, cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer));
return FALSE;
}
- for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
+ for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
cs->buffer_indices_hashlist[i] = -1;
}
return TRUE;
cs->used_gart = 0;
cs->used_vram = 0;
- for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
+ for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
cs->buffer_indices_hashlist[i] = -1;
}
}
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
- void *flush_ctx,
- struct radeon_winsys_cs_handle *trace_buf)
+ void *flush_ctx)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
struct amdgpu_cs *cs;
cs->ctx = ctx;
cs->flush_cs = flush;
cs->flush_data = flush_ctx;
- cs->base.ring_type = ring_type;
+ cs->ring_type = ring_type;
if (!amdgpu_init_cs_context(cs, ring_type)) {
FREE(cs);
return NULL;
}
- if (!amdgpu_get_new_ib(cs)) {
+ if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN)) {
amdgpu_destroy_cs_context(cs);
FREE(cs);
return NULL;
}
+ cs->request.number_of_ibs = 1;
+ cs->request.ibs = &cs->ib[IB_MAIN];
+
p_atomic_inc(&ctx->ws->num_cs);
- return &cs->base;
+ return &cs->main.base;
+}
+/* Attach a const IB to an existing CS and return its radeon_winsys_cs so the
+ * caller can fill it.
+ *
+ * Returns NULL when the CS is not on RING_GFX, when a const IB has already
+ * been mapped for this CS, or when no IB space could be obtained.
+ *
+ * On success the submission request is switched to 2 IBs starting at
+ * ib[IB_CONST], and that IB is flagged AMDGPU_IB_FLAG_CE.
+ * NOTE(review): this presumably relies on ib[IB_CONST] being immediately
+ * followed by ib[IB_MAIN] -- confirm against the IB_* enum order.
+ */
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs)
+{
+ struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; /* NOTE(review): assumes rcs sits at offset 0 of amdgpu_cs -- confirm */
+ struct amdgpu_winsys *ws = cs->ctx->ws;
+
+ /* only one const IB can be added */
+ if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped)
+ return NULL;
+
+ if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], IB_CONST))
+ return NULL;
+
+ cs->request.number_of_ibs = 2;
+ cs->request.ibs = &cs->ib[IB_CONST];
+ cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE;
+
+ return &cs->const_ib.base;
+}
+/* Attach a const preamble IB to an existing CS and return its
+ * radeon_winsys_cs so the caller can fill it.
+ *
+ * Returns NULL when the CS is not on RING_GFX, when the const IB has not
+ * been mapped yet, when a const preamble IB is already mapped, or when no
+ * IB space could be obtained.
+ *
+ * On success the submission request is switched to 3 IBs starting at
+ * ib[IB_CONST_PREAMBLE], and that IB is flagged
+ * AMDGPU_IB_FLAG_CE | AMDGPU_IB_FLAG_PREAMBLE.
+ * NOTE(review): this presumably relies on the ib[] entries being laid out
+ * as preamble, const, main -- confirm against the IB_* enum order.
+ */
+static struct radeon_winsys_cs *
+amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs)
+{
+ struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; /* NOTE(review): assumes rcs sits at offset 0 of amdgpu_cs -- confirm */
+ struct amdgpu_winsys *ws = cs->ctx->ws;
+
+ /* only one const preamble IB can be added and only when the const IB has
+ * also been mapped */
+ if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped ||
+ cs->const_preamble_ib.ib_mapped)
+ return NULL;
+
+ if (!amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
+ &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE))
+ return NULL;
+
+ cs->request.number_of_ibs = 3;
+ cs->request.ibs = &cs->ib[IB_CONST_PREAMBLE];
+ cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | AMDGPU_IB_FLAG_PREAMBLE;
+
+ return &cs->const_preamble_ib.base;
}
#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
-int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
+int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
{
- unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
+ unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
int i = cs->buffer_indices_hashlist[hash];
/* not found or found */
if (i == -1 || cs->buffers[i].bo == bo)
return i;
- /* Hash collision, look for the BO in the list of relocs linearly. */
+ /* Hash collision, look for the BO in the list of buffers linearly. */
for (i = cs->num_buffers - 1; i >= 0; i--) {
if (cs->buffers[i].bo == bo) {
- /* Put this reloc in the hash list.
+ /* Put this buffer in the hash list.
* This will prevent additional hash collisions if there are
- * several consecutive get_reloc calls for the same buffer.
+ * several consecutive lookup_buffer calls for the same buffer.
*
* Example: Assuming buffers A,B,C collide in the hash list,
- * the following sequence of relocs:
+ * the following sequence of buffers:
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
* will collide here: ^ and here: ^,
* meaning that we should get very few collisions in the end. */
return -1;
}
-static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
+static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs,
struct amdgpu_winsys_bo *bo,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
unsigned priority,
enum radeon_bo_domain *added_domains)
{
- struct amdgpu_cs_buffer *reloc;
- unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
+ struct amdgpu_cs_buffer *buffer;
+ unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
int i = -1;
- priority = MIN2(priority, 15);
+ assert(priority < 64);
*added_domains = 0;
- i = amdgpu_get_reloc(cs, bo);
+ i = amdgpu_lookup_buffer(cs, bo);
if (i >= 0) {
- reloc = &cs->buffers[i];
- reloc->usage |= usage;
- *added_domains = domains & ~reloc->domains;
- reloc->domains |= domains;
- cs->flags[i] = MAX2(cs->flags[i], priority);
+ buffer = &cs->buffers[i];
+ buffer->priority_usage |= 1llu << priority;
+ buffer->usage |= usage;
+ *added_domains = domains & ~buffer->domains;
+ buffer->domains |= domains;
+ cs->flags[i] = MAX2(cs->flags[i], priority / 4);
return i;
}
- /* New relocation, check if the backing array is large enough. */
+ /* New buffer, check if the backing array is large enough. */
if (cs->num_buffers >= cs->max_num_buffers) {
uint32_t size;
cs->max_num_buffers += 10;
cs->flags = realloc(cs->flags, cs->max_num_buffers);
}
- /* Initialize the new relocation. */
+ /* Initialize the new buffer. */
cs->buffers[cs->num_buffers].bo = NULL;
amdgpu_winsys_bo_reference(&cs->buffers[cs->num_buffers].bo, bo);
cs->handles[cs->num_buffers] = bo->bo;
- cs->flags[cs->num_buffers] = priority;
+ cs->flags[cs->num_buffers] = priority / 4;
p_atomic_inc(&bo->num_cs_references);
- reloc = &cs->buffers[cs->num_buffers];
- reloc->bo = bo;
- reloc->usage = usage;
- reloc->domains = domains;
+ buffer = &cs->buffers[cs->num_buffers];
+ buffer->bo = bo;
+ buffer->priority_usage = 1llu << priority;
+ buffer->usage = usage;
+ buffer->domains = domains;
cs->buffer_indices_hashlist[hash] = cs->num_buffers;
return cs->num_buffers++;
}
-static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
- struct radeon_winsys_cs_handle *buf,
+static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
+ struct pb_buffer *buf,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
enum radeon_bo_priority priority)
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
enum radeon_bo_domain added_domains;
- unsigned index = amdgpu_add_reloc(cs, bo, usage, bo->initial_domain,
+ unsigned index = amdgpu_add_buffer(cs, bo, usage, bo->initial_domain,
priority, &added_domains);
- if (added_domains & RADEON_DOMAIN_GTT)
- cs->used_gart += bo->base.size;
if (added_domains & RADEON_DOMAIN_VRAM)
cs->used_vram += bo->base.size;
+ else if (added_domains & RADEON_DOMAIN_GTT)
+ cs->used_gart += bo->base.size;
return index;
}
-static int amdgpu_cs_get_reloc(struct radeon_winsys_cs *rcs,
- struct radeon_winsys_cs_handle *buf)
+static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
+ struct pb_buffer *buf)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
- return amdgpu_get_reloc(cs, (struct amdgpu_winsys_bo*)buf);
+ return amdgpu_lookup_buffer(cs, (struct amdgpu_winsys_bo*)buf); /* index into cs->buffers, or -1 if the buffer is not in this CS */
}
static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
- boolean status =
- (cs->used_gart + gtt) < cs->ctx->ws->info.gart_size * 0.7 &&
- (cs->used_vram + vram) < cs->ctx->ws->info.vram_size * 0.7;
+ struct amdgpu_winsys *ws = cs->ctx->ws;
+ /* Add what this CS already accounts for to the amounts being asked about. */
+ vram += cs->used_vram;
+ gtt += cs->used_gart;
+
+ /* Anything that goes above the VRAM size should go to GTT. */
+ if (vram > ws->info.vram_size)
+ gtt += vram - ws->info.vram_size;
+
+ /* Now we just need to check if we have enough GTT. */
+ return gtt < ws->info.gart_size * 0.7; /* true only while the total stays under 70% of the GART size */
+}
+/* Return the sum of the CS's used_vram and used_gart counters. */
+static uint64_t amdgpu_cs_query_memory_usage(struct radeon_winsys_cs *rcs)
+{
+ struct amdgpu_cs *cs = amdgpu_cs(rcs);
+
+ return cs->used_vram + cs->used_gart;
+}
- return status;
+static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
+ struct radeon_bo_list_item *list)
+{
+ struct amdgpu_cs *cs = amdgpu_cs(rcs);
+ int i;
+ /* The return value is always the number of buffers in the CS. With a NULL
+ * list nothing else happens; otherwise one entry per buffer is written
+ * (referenced buffer, its va, and its priority_usage mask), so the caller
+ * must supply room for at least num_buffers entries.
+ * NOTE(review): pb_reference presumably releases whatever list[i].buf held
+ * before, so entries should start out zeroed or valid -- confirm. */
+ if (list) {
+ for (i = 0; i < cs->num_buffers; i++) {
+ pb_reference(&list[i].buf, &cs->buffers[i].bo->base);
+ list[i].vm_address = cs->buffers[i].bo->va;
+ list[i].priority_usage = cs->buffers[i].priority_usage;
+ }
+ }
+ return cs->num_buffers;
}
static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
cs->request.fence_info.handle = NULL;
if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) {
cs->request.fence_info.handle = cs->ctx->user_fence_bo;
- cs->request.fence_info.offset = cs->base.ring_type;
+ cs->request.fence_info.offset = cs->ring_type;
}
r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1);
amdgpu_fence_submitted(fence, &cs->request, user_fence);
for (i = 0; i < cs->num_buffers; i++)
- amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->base.ring_type],
+ amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->ring_type],
fence);
}
pipe_mutex_unlock(ws->bo_fence_lock);
}
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
unsigned flags,
- struct pipe_fence_handle **fence,
- uint32_t cs_trace_id)
+ struct pipe_fence_handle **fence)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *ws = cs->ctx->ws;
- switch (cs->base.ring_type) {
+ switch (cs->ring_type) {
case RING_DMA:
/* pad DMA ring to 8 DWs */
while (rcs->cdw & 7)
- OUT_CS(&cs->base, 0x00000000); /* NOP packet */
+ OUT_CS(rcs, 0x00000000); /* NOP packet */
break;
case RING_GFX:
/* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
while (rcs->cdw & 7)
- OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
+ OUT_CS(rcs, 0xffff1000); /* type3 nop packet */
+
+ /* Also pad the const IB. */
+ if (cs->const_ib.ib_mapped)
+ while (!cs->const_ib.base.cdw || (cs->const_ib.base.cdw & 7))
+ OUT_CS(&cs->const_ib.base, 0xffff1000); /* type3 nop packet */
+
+ if (cs->const_preamble_ib.ib_mapped)
+ while (!cs->const_preamble_ib.base.cdw || (cs->const_preamble_ib.base.cdw & 7))
+ OUT_CS(&cs->const_preamble_ib.base, 0xffff1000);
break;
case RING_UVD:
while (rcs->cdw & 15)
- OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+ OUT_CS(rcs, 0x80000000); /* type2 nop packet */
break;
default:
break;
fprintf(stderr, "amdgpu: command stream overflowed\n");
}
- amdgpu_cs_add_reloc(rcs, (void*)cs->big_ib_winsys_buffer,
- RADEON_USAGE_READ, 0, RADEON_PRIO_MIN);
+ amdgpu_cs_add_buffer(rcs, cs->main.big_ib_buffer,
+ RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
+
+ if (cs->const_ib.ib_mapped)
+ amdgpu_cs_add_buffer(rcs, cs->const_ib.big_ib_buffer,
+ RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
+
+ if (cs->const_preamble_ib.ib_mapped)
+ amdgpu_cs_add_buffer(rcs, cs->const_preamble_ib.big_ib_buffer,
+ RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
/* If the CS is not empty or overflowed.... */
- if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
+ if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw && !debug_get_option_noop()) {
int r;
- r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
- cs->handles, cs->flags,
- &cs->request.resources);
+ /* Use a buffer list containing all allocated buffers if requested. */
+ if (debug_get_option_all_bos()) {
+ struct amdgpu_winsys_bo *bo;
+ amdgpu_bo_handle *handles;
+ unsigned num = 0;
+
+ pipe_mutex_lock(ws->global_bo_list_lock);
+
+ handles = malloc(sizeof(handles[0]) * ws->num_buffers);
+ if (!handles) {
+ pipe_mutex_unlock(ws->global_bo_list_lock);
+ goto cleanup;
+ }
+
+ LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
+ assert(num < ws->num_buffers);
+ handles[num++] = bo->bo;
+ }
+
+ r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
+ handles, NULL,
+ &cs->request.resources);
+ free(handles);
+ pipe_mutex_unlock(ws->global_bo_list_lock);
+ } else {
+ r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
+ cs->handles, cs->flags,
+ &cs->request.resources);
+ }
if (r) {
fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
goto cleanup;
}
- cs->ib.size = cs->base.cdw;
- cs->used_ib_space += cs->base.cdw * 4;
+ cs->ib[IB_MAIN].size = cs->main.base.cdw;
+ cs->main.used_ib_space += cs->main.base.cdw * 4;
+
+ if (cs->const_ib.ib_mapped) {
+ cs->ib[IB_CONST].size = cs->const_ib.base.cdw;
+ cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4;
+ }
+
+ if (cs->const_preamble_ib.ib_mapped) {
+ cs->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw;
+ cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4;
+ }
amdgpu_cs_do_submission(cs, fence);
cleanup:
amdgpu_cs_context_cleanup(cs);
- amdgpu_get_new_ib(cs);
+
+ amdgpu_get_new_ib(&ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN);
+ if (cs->const_ib.ib_mapped)
+ amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], IB_CONST);
+ if (cs->const_preamble_ib.ib_mapped)
+ amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib,
+ &cs->ib[IB_CONST_PREAMBLE], IB_CONST_PREAMBLE);
ws->num_cs_flushes++;
}
amdgpu_destroy_cs_context(cs);
p_atomic_dec(&cs->ctx->ws->num_cs);
- pb_reference(&cs->big_ib_buffer, NULL);
+ pb_reference(&cs->main.big_ib_buffer, NULL);
+ pb_reference(&cs->const_ib.big_ib_buffer, NULL);
+ pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL);
FREE(cs);
}
static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
- struct radeon_winsys_cs_handle *_buf,
+ struct pb_buffer *_buf,
enum radeon_bo_usage usage)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
ws->base.ctx_destroy = amdgpu_ctx_destroy;
ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
ws->base.cs_create = amdgpu_cs_create;
+ ws->base.cs_add_const_ib = amdgpu_cs_add_const_ib;
+ ws->base.cs_add_const_preamble_ib = amdgpu_cs_add_const_preamble_ib;
ws->base.cs_destroy = amdgpu_cs_destroy;
- ws->base.cs_add_reloc = amdgpu_cs_add_reloc;
- ws->base.cs_get_reloc = amdgpu_cs_get_reloc;
+ ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
+ ws->base.cs_lookup_buffer = amdgpu_cs_lookup_buffer;
ws->base.cs_validate = amdgpu_cs_validate;
ws->base.cs_memory_below_limit = amdgpu_cs_memory_below_limit;
+ ws->base.cs_query_memory_usage = amdgpu_cs_query_memory_usage;
+ ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
ws->base.cs_flush = amdgpu_cs_flush;
ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
ws->base.cs_sync_flush = amdgpu_cs_sync_flush;