From 83a01cb4983fd4b8ee8402a0679bead2bc0094af Mon Sep 17 00:00:00 2001 From: =?utf8?q?Nicolai=20H=C3=A4hnle?= Date: Sat, 7 May 2016 10:58:13 -0500 Subject: [PATCH] winsys/amdgpu: start with smaller IBs, growing as necessary MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This avoids allocating giant IBs from the outset, especially for CE and DMA. Since we now limit max_dw only by the size that the buffer happens to be (which, due to the buffer cache, can be even larger than the rounded-up size we request), the new function amdgpu_ib_max_submit_dwords controls when we submit an IB. With this change, we effectively never flush prematurely due to the CE IB, after an initial warm-up phase. v2: - clean up buffer_size calculation Reviewed-by: Marek Olšák --- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 80 ++++++++++++++++++++--- src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 1 + 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index f070307e25e..781960c9600 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -336,11 +336,33 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, return index; } -static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib, - unsigned buffer_size) +static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) { struct pb_buffer *pb; uint8_t *mapped; + unsigned buffer_size; + + /* Always create a buffer that is 4 times larger than the maximum seen IB + * size, aligned to a power of two. Limit to 512k dwords, which is the + * largest power of two that fits into the size field of the INDIRECT_BUFFER + * packet. + */ + buffer_size = 4 * MIN2(util_next_power_of_two(4 * ib->max_ib_size), + 512 * 1024); + + switch (ib->ib_type) { + case IB_CONST_PREAMBLE: + buffer_size = MAX2(buffer_size, 4 * 1024); + break; + case IB_CONST: + buffer_size = MAX2(buffer_size, 16 * 1024 * 4); + break; + case IB_MAIN: + buffer_size = MAX2(buffer_size, 8 * 1024 * 4); + break; + default: + unreachable("unhandled IB type"); + } pb = ws->base.buffer_create(&ws->base, buffer_size, ws->info.gart_page_size, @@ -364,6 +386,27 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib, return true; } +static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) +{ + switch (ib_type) { + case IB_MAIN: + /* Smaller submits means the GPU gets busy sooner and there is less + * waiting for buffers and fences. Proof: + * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 + */ + return 20 * 1024; + case IB_CONST_PREAMBLE: + case IB_CONST: + /* There isn't really any reason to limit CE IB size beyond the natural + * limit implied by the main IB, except perhaps GTT size. Just return + * an extremely large value that we never get anywhere close to. + */ + return 16 * 1024 * 1024; + default: + unreachable("bad ib_type"); + } +} + static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, enum ib_type ib_type) { @@ -374,35 +417,36 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, */ struct amdgpu_ib *ib = NULL; struct amdgpu_cs_ib_info *info = &cs->csc->ib[ib_type]; - unsigned buffer_size, ib_size; + unsigned ib_size = 0; switch (ib_type) { case IB_CONST_PREAMBLE: ib = &cs->const_preamble_ib; - buffer_size = 4 * 1024 * 4; - ib_size = 1024 * 4; + ib_size = 256 * 4; break; case IB_CONST: ib = &cs->const_ib; - buffer_size = 512 * 1024 * 4; - ib_size = 128 * 1024 * 4; + ib_size = 8 * 1024 * 4; break; case IB_MAIN: ib = &cs->main; - buffer_size = 128 * 1024 * 4; - ib_size = 20 * 1024 * 4; + ib_size = 4 * 1024 * 4; break; default: unreachable("unhandled IB type"); } + ib_size = MAX2(ib_size, + 4 * MIN2(util_next_power_of_two(ib->max_ib_size), + amdgpu_ib_max_submit_dwords(ib_type))); + ib->base.cdw = 0; ib->base.buf = NULL; /* Allocate a new buffer for IBs if the current buffer is all used. */ if (!ib->big_ib_buffer || ib->used_ib_space + ib_size > ib->big_ib_buffer->size) { - if (!amdgpu_ib_new_buffer(aws, ib, buffer_size)) + if (!amdgpu_ib_new_buffer(aws, ib)) return false; } @@ -412,6 +456,8 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, RADEON_USAGE_READ, 0, RADEON_PRIO_IB1); ib->base.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space); + + ib_size = ib->big_ib_buffer->size - ib->used_ib_space; ib->base.max_dw = ib_size / 4; return true; } @@ -624,7 +670,17 @@ static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs) static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw) { + struct amdgpu_ib *ib = amdgpu_ib(rcs); + struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib); + unsigned requested_size = rcs->cdw + dw; + assert(rcs->cdw <= rcs->max_dw); + + if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type)) + return false; + + ib->max_ib_size = MAX2(ib->max_ib_size, requested_size); + return rcs->max_dw - rcs->cdw >= dw; } @@ -861,15 +917,19 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs, /* Set IB sizes. */ cur->ib[IB_MAIN].size = cs->main.base.cdw; cs->main.used_ib_space += cs->main.base.cdw * 4; + cs->main.max_ib_size = MAX2(cs->main.max_ib_size, cs->main.base.cdw); if (cs->const_ib.ib_mapped) { cur->ib[IB_CONST].size = cs->const_ib.base.cdw; cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4; + cs->const_ib.max_ib_size = MAX2(cs->const_ib.max_ib_size, cs->const_ib.base.cdw); } if (cs->const_preamble_ib.ib_mapped) { cur->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw; cs->const_preamble_ib.used_ib_space += cs->const_preamble_ib.base.cdw * 4; + cs->const_preamble_ib.max_ib_size = + MAX2(cs->const_preamble_ib.max_ib_size, cs->const_preamble_ib.base.cdw); } /* Create a fence. */ diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 25bad07af3e..62811e9aa10 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -64,6 +64,7 @@ struct amdgpu_ib { struct pb_buffer *big_ib_buffer; uint8_t *ib_mapped; unsigned used_ib_space; + unsigned max_ib_size; enum ib_type ib_type; }; -- 2.30.2