From fedd0a4215bcd387525000d76b77993ca38916ae Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 20 Apr 2018 13:42:36 +0200 Subject: [PATCH] radv/winsys: allow to submit up to 4 IBs for chips without chaining The SI family doesn't support chaining which means the maximum size in dwords per CS is limited. When that limit was reached we failed to submit the CS and the application crashed. This patch allows to submit up to 4 IBs which is currently the limit, but recent amdgpu supports more than that. Please note that we can reach the limit of 4 IBs per submit but currently we can't improve that. The only solution is to upgrade libdrm. That will be improved later but for now this should fix crashes on SI or when using RADV_DEBUG=noibs. Fixes: 36cb5508e89 ("radv/winsys: Fail early on overgrown cs.") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105775 Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 218 ++++++++++++++---- 1 file changed, 168 insertions(+), 50 deletions(-) diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index c4b2232ce9e..0cd870b7c89 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -68,6 +68,10 @@ struct radv_amdgpu_cs { struct radeon_winsys_bo **virtual_buffers; uint8_t *virtual_buffer_priorities; int *virtual_buffer_hash_table; + + /* For chips that don't support chaining. */ + struct radeon_winsys_cs *old_cs_buffers; + unsigned num_old_cs_buffers; }; static inline struct radv_amdgpu_cs * @@ -201,6 +205,12 @@ static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i) cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]); + for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) { + struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i]; + free(rcs->buf); + } + + free(cs->old_cs_buffers); free(cs->old_ib_buffers); free(cs->virtual_buffers); free(cs->virtual_buffer_priorities); @@ -286,9 +296,46 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size) /* The total ib size cannot exceed limit_dws dwords. */ if (ib_dws > limit_dws) { - cs->failed = true; + /* The maximum size in dwords has been reached, + * try to allocate a new one. + */ + if (cs->num_old_cs_buffers + 1 >= AMDGPU_CS_MAX_IBS_PER_SUBMIT) { + /* TODO: Allow to submit more than 4 IBs. */ + fprintf(stderr, "amdgpu: Maximum number of IBs " + "per submit reached.\n"); + cs->failed = true; + cs->base.cdw = 0; + return; + } + + cs->old_cs_buffers = + realloc(cs->old_cs_buffers, + (cs->num_old_cs_buffers + 1) * sizeof(*cs->old_cs_buffers)); + if (!cs->old_cs_buffers) { + cs->failed = true; + cs->base.cdw = 0; + return; + } + + /* Store the current one for submitting it later. */ + cs->old_cs_buffers[cs->num_old_cs_buffers].cdw = cs->base.cdw; + cs->old_cs_buffers[cs->num_old_cs_buffers].max_dw = cs->base.max_dw; + cs->old_cs_buffers[cs->num_old_cs_buffers].buf = cs->base.buf; + cs->num_old_cs_buffers++; + + /* Reset the cs, it will be re-allocated below. */ cs->base.cdw = 0; - return; + cs->base.buf = NULL; + + /* Re-compute the number of dwords to allocate. */ + ib_dws = MAX2(cs->base.cdw + min_size, + MIN2(cs->base.max_dw * 2, limit_dws)); + if (ib_dws > limit_dws) { + fprintf(stderr, "amdgpu: Too high number of " + "dwords to allocate\n"); + cs->failed = true; + return; + } } uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4); @@ -400,6 +447,15 @@ static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs) cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va; cs->ib_size_ptr = &cs->ib.size; cs->ib.size = 0; + } else { + for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) { + struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i]; + free(rcs->buf); + } + + free(cs->old_cs_buffers); + cs->old_cs_buffers = NULL; + cs->num_old_cs_buffers = 0; } } @@ -550,7 +606,8 @@ static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent, static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_winsys_cs **cs_array, unsigned count, - struct radv_amdgpu_winsys_bo *extra_bo, + struct radv_amdgpu_winsys_bo **extra_bo_array, + unsigned num_extra_bo, struct radeon_winsys_cs *extra_cs, const struct radv_winsys_bo_list *radv_bo_list, amdgpu_bo_list_handle *bo_list) @@ -580,7 +637,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, bo_list); free(handles); pthread_mutex_unlock(&ws->global_bo_list_lock); - } else if (count == 1 && !extra_bo && !extra_cs && !radv_bo_list && + } else if (count == 1 && !num_extra_bo && !extra_cs && !radv_bo_list && !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) { struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0]; if (cs->num_buffers == 0) { @@ -590,8 +647,8 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles, cs->priorities, bo_list); } else { - unsigned total_buffer_count = !!extra_bo; - unsigned unique_bo_count = !!extra_bo; + unsigned total_buffer_count = num_extra_bo; + unsigned unique_bo_count = num_extra_bo; for (unsigned i = 0; i < count; ++i) { struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i]; total_buffer_count += cs->num_buffers; @@ -619,9 +676,9 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, return -ENOMEM; } - if (extra_bo) { - handles[0] = extra_bo->bo; - priorities[0] = 8; + for (unsigned i = 0; i < num_extra_bo; i++) { + handles[i] = extra_bo_array[i]->bo; + priorities[i] = 8; } for (unsigned i = 0; i < count + !!extra_cs; ++i) { @@ -773,7 +830,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx, } } - r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs, + r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, 0, initial_preamble_cs, radv_bo_list, &bo_list); if (r) { fprintf(stderr, "amdgpu: buffer list creation failed for the " @@ -842,7 +899,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx, memset(&request, 0, sizeof(request)); - r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, + r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, 0, preamble_cs, radv_bo_list, &bo_list); if (r) { fprintf(stderr, "amdgpu: buffer list creation failed " @@ -923,68 +980,127 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx, assert(cs_count); for (unsigned i = 0; i < cs_count;) { - struct amdgpu_cs_ib_info ib = {0}; - struct radeon_winsys_bo *bo = NULL; + struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0}; + unsigned number_of_ibs = 1; + struct radeon_winsys_bo *bos[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0}; struct radeon_winsys_cs *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs; + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); uint32_t *ptr; unsigned cnt = 0; unsigned size = 0; unsigned pad_words = 0; - if (preamble_cs) - size += preamble_cs->cdw; - while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) { - size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw; - ++cnt; - } + if (cs->num_old_cs_buffers > 0) { + /* Special path when the maximum size in dwords has + * been reached because we need to handle more than one + * IB per submit. + */ + unsigned new_cs_count = cs->num_old_cs_buffers + 1; + struct radeon_winsys_cs *new_cs_array[AMDGPU_CS_MAX_IBS_PER_SUBMIT]; + unsigned idx = 0; + + for (unsigned j = 0; j < cs->num_old_cs_buffers; j++) + new_cs_array[idx++] = &cs->old_cs_buffers[j]; + new_cs_array[idx++] = cs_array[i]; + + for (unsigned j = 0; j < new_cs_count; j++) { + struct radeon_winsys_cs *rcs = new_cs_array[j]; + bool needs_preamble = preamble_cs && j == 0; + unsigned size = 0; + + if (needs_preamble) + size += preamble_cs->cdw; + size += rcs->cdw; + + assert(size < 0xffff8); + + while (!size || (size & 7)) { + size++; + pad_words++; + } - while(!size || (size & 7)) { - size++; - pad_words++; - } - assert(cnt); + bos[j] = ws->buffer_create(ws, 4 * size, 4096, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS | + RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_READ_ONLY); + ptr = ws->buffer_map(bos[j]); - bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS | - RADEON_FLAG_NO_INTERPROCESS_SHARING | - RADEON_FLAG_READ_ONLY); - ptr = ws->buffer_map(bo); + if (needs_preamble) { + memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4); + ptr += preamble_cs->cdw; + } - if (preamble_cs) { - memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4); - ptr += preamble_cs->cdw; - } + memcpy(ptr, rcs->buf, 4 * rcs->cdw); + ptr += rcs->cdw; - for (unsigned j = 0; j < cnt; ++j) { - struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]); - memcpy(ptr, cs->base.buf, 4 * cs->base.cdw); - ptr += cs->base.cdw; + for (unsigned k = 0; k < pad_words; ++k) + *ptr++ = pad_word; - } + ibs[j].size = size; + ibs[j].ib_mc_address = radv_buffer_get_va(bos[j]); + } - for (unsigned j = 0; j < pad_words; ++j) - *ptr++ = pad_word; + number_of_ibs = new_cs_count; + cnt++; + } else { + if (preamble_cs) + size += preamble_cs->cdw; - memset(&request, 0, sizeof(request)); + while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) { + size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw; + ++cnt; + } + + while (!size || (size & 7)) { + size++; + pad_words++; + } + assert(cnt); + bos[0] = ws->buffer_create(ws, 4 * size, 4096, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS | + RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_READ_ONLY); + ptr = ws->buffer_map(bos[0]); + + if (preamble_cs) { + memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4); + ptr += preamble_cs->cdw; + } + + for (unsigned j = 0; j < cnt; ++j) { + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]); + memcpy(ptr, cs->base.buf, 4 * cs->base.cdw); + ptr += cs->base.cdw; + + } + + for (unsigned j = 0; j < pad_words; ++j) + *ptr++ = pad_word; + + ibs[0].size = size; + ibs[0].ib_mc_address = radv_buffer_get_va(bos[0]); + } r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, - (struct radv_amdgpu_winsys_bo*)bo, - preamble_cs, radv_bo_list, &bo_list); + (struct radv_amdgpu_winsys_bo **)bos, + number_of_ibs, preamble_cs, + radv_bo_list, &bo_list); if (r) { fprintf(stderr, "amdgpu: buffer list creation failed " "for the sysmem submission (%d)\n", r); return r; } - ib.size = size; - ib.ib_mc_address = radv_buffer_get_va(bo); + memset(&request, 0, sizeof(request)); request.ip_type = cs0->hw_ip; request.ring = queue_idx; request.resources = bo_list; - request.number_of_ibs = 1; - request.ibs = &ib; + request.number_of_ibs = number_of_ibs; + request.ibs = ibs; request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx); sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false; @@ -1000,9 +1116,11 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx, if (bo_list) amdgpu_bo_list_destroy(bo_list); - ws->buffer_destroy(bo); - if (r) - return r; + for (unsigned j = 0; j < number_of_ibs; j++) { + ws->buffer_destroy(bos[j]); + if (r) + return r; + } i += cnt; } -- 2.30.2