From 373a204bdd7b6bd0d3bd1b052ef67824d1b81fa7 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 17 Aug 2020 14:27:57 -0400 Subject: [PATCH] panfrost: Pass alignments explicitly In most cases, GPU data structures need only be self-aligned; the worst-case 128 byte alignment is wasteful. By passing explicit alignments, we can reduce memory usage, avoid extra allocations, and improve descriptor cache locality. Signed-off-by: Alyssa Rosenzweig Reviewed-by: Tomeu Vizoso Part-of: --- src/gallium/drivers/panfrost/pan_cmdstream.c | 50 ++++++++++++-------- src/gallium/drivers/panfrost/pan_job.c | 6 +-- src/gallium/drivers/panfrost/pan_mfbd.c | 2 +- src/panfrost/lib/pan_blit.c | 7 ++- src/panfrost/lib/pan_pool.c | 13 ++--- src/panfrost/lib/pan_pool.h | 11 +++++ src/panfrost/lib/pan_scoreboard.c | 3 +- 7 files changed, 55 insertions(+), 37 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 699b48777cb..73c321cd8a3 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -217,9 +217,13 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, } else { /* Otherwise, we need to upload to transient memory */ const uint8_t *ibuf8 = (const uint8_t *) info->index.user; - out = panfrost_pool_upload(&batch->pool, ibuf8 + offset, - info->count * - info->index_size); + struct panfrost_transfer T = + panfrost_pool_alloc_aligned(&batch->pool, + info->count * info->index_size, + info->index_size); + + memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size); + out = T.gpu; } if (needs_indices) { @@ -814,7 +818,7 @@ panfrost_emit_shader_meta(struct panfrost_batch *batch, panfrost_frag_shader_meta_init(ctx, &meta, rts); - xfer = panfrost_pool_alloc(&batch->pool, desc_size); + xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta)); memcpy(xfer.cpu, &meta, sizeof(meta)); memcpy(xfer.cpu + sizeof(meta), rts, rt_size * 
rt_count); @@ -1106,8 +1110,8 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, size_t sys_size = sizeof(float) * 4 * ss->sysval_count; size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0; size_t size = sys_size + uniform_size; - struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool, - size); + struct panfrost_transfer transfer = + panfrost_pool_alloc_aligned(&batch->pool, size, 16); /* Upload sysvals requested by the shader */ panfrost_upload_sysvals(batch, transfer.cpu, ss, stage); @@ -1125,7 +1129,10 @@ panfrost_emit_const_buf(struct panfrost_batch *batch, assert(ubo_count >= 1); size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count; - struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz); + struct panfrost_transfer ubos = + panfrost_pool_alloc_aligned(&batch->pool, sz, + MALI_UNIFORM_BUFFER_LENGTH); + uint64_t *ubo_ptr = (uint64_t *) ubos.cpu; /* Upload uniforms as a UBO */ @@ -1244,9 +1251,10 @@ panfrost_emit_texture_descriptors(struct panfrost_batch *batch, return; if (device->quirks & IS_BIFROST) { - struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, + struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, MALI_BIFROST_TEXTURE_LENGTH * - ctx->sampler_view_count[stage]); + ctx->sampler_view_count[stage], + MALI_BIFROST_TEXTURE_LENGTH); struct mali_bifrost_texture_packed *out = (struct mali_bifrost_texture_packed *) T.cpu; @@ -1303,7 +1311,7 @@ panfrost_emit_sampler_descriptors(struct panfrost_batch *batch, assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH); size_t sz = desc_size * ctx->sampler_count[stage]; - struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, sz); + struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size); struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu; for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) @@ -1324,11 +1332,13 @@ panfrost_emit_vertex_data(struct 
panfrost_batch *batch, /* Worst case: everything is NPOT */ - struct panfrost_transfer S = panfrost_pool_alloc(&batch->pool, - MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2); + struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool, + MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2, + MALI_ATTRIBUTE_LENGTH); - struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, - MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1)); + struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, + MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1), + MALI_ATTRIBUTE_LENGTH); struct mali_attribute_buffer_packed *bufs = (struct mali_attribute_buffer_packed *) S.cpu; @@ -1496,7 +1506,7 @@ panfrost_emit_varyings(struct panfrost_batch *batch, unsigned stride, unsigned count) { unsigned size = stride * count; - mali_ptr ptr = panfrost_pool_alloc(&batch->invisible_pool, size).gpu; + mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu; pan_pack(slot, ATTRIBUTE_BUFFER, cfg) { cfg.stride = stride; @@ -1931,9 +1941,8 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count; fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count; - struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool, - vs_size + - fs_size); + struct panfrost_transfer trans = panfrost_pool_alloc_aligned( + &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH); struct pipe_stream_output_info *so = &vs->stream_output; unsigned present = pan_varying_present(vs, fs, dev->quirks); @@ -1979,8 +1988,9 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch, } unsigned xfb_base = pan_xfb_base(present); - struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, - MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets)); + struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, + MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets), + 
MALI_ATTRIBUTE_BUFFER_LENGTH); struct mali_attribute_buffer_packed *varyings = (struct mali_attribute_buffer_packed *) T.cpu; diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index c349a058270..a0f0f47940c 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -748,7 +748,7 @@ panfrost_batch_reserve_framebuffer(struct panfrost_batch *batch) sizeof(struct mali_single_framebuffer) : sizeof(struct mali_framebuffer); - batch->framebuffer = panfrost_pool_alloc(&batch->pool, size); + batch->framebuffer = panfrost_pool_alloc_aligned(&batch->pool, size, 64); /* Tag the pointer */ if (!(dev->quirks & MIDGARD_SFBD)) @@ -870,8 +870,8 @@ panfrost_load_surface(struct panfrost_batch *batch, struct pipe_surface *surf, u blend_shader = bo->gpu | b->first_tag; } - struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool, - 4 * 4 * 6 * rsrc->damage.inverted_len); + struct panfrost_transfer transfer = panfrost_pool_alloc_aligned(&batch->pool, + 4 * 4 * 6 * rsrc->damage.inverted_len, 64); for (unsigned i = 0; i < rsrc->damage.inverted_len; ++i) { float *o = (float *) (transfer.cpu + (4 * 4 * 6 * i)); diff --git a/src/gallium/drivers/panfrost/pan_mfbd.c b/src/gallium/drivers/panfrost/pan_mfbd.c index 4ae7320a992..deccd3a4239 100644 --- a/src/gallium/drivers/panfrost/pan_mfbd.c +++ b/src/gallium/drivers/panfrost/pan_mfbd.c @@ -395,7 +395,7 @@ panfrost_mfbd_upload(struct panfrost_batch *batch, sizeof(struct mali_render_target) * 8; struct panfrost_transfer m_f_trans = - panfrost_pool_alloc(&batch->pool, total_sz); + panfrost_pool_alloc_aligned(&batch->pool, total_sz, 64); /* Do the transfer */ diff --git a/src/panfrost/lib/pan_blit.c b/src/panfrost/lib/pan_blit.c index deec17ba720..7494b0a2731 100644 --- a/src/panfrost/lib/pan_blit.c +++ b/src/panfrost/lib/pan_blit.c @@ -290,7 +290,8 @@ panfrost_load_midg( * textures, removing the need to separately key the blit shaders for * 2D and 3D 
variants */ - struct panfrost_transfer texture_t = panfrost_pool_alloc(pool, MALI_MIDGARD_TEXTURE_LENGTH + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1)); + struct panfrost_transfer texture_t = panfrost_pool_alloc_aligned( + pool, MALI_MIDGARD_TEXTURE_LENGTH + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1), 128); panfrost_new_texture(texture_t.cpu, image->width0, image->height0, @@ -311,7 +312,9 @@ panfrost_load_midg( pan_pack(sampler.cpu, MIDGARD_SAMPLER, cfg) cfg.normalized_coordinates = false; - struct panfrost_transfer shader_meta_t = panfrost_pool_alloc(pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt)); + struct panfrost_transfer shader_meta_t = panfrost_pool_alloc_aligned( + pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt), 128); + memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta)); for (unsigned i = 0; i < 8; ++i) { diff --git a/src/panfrost/lib/pan_pool.c b/src/panfrost/lib/pan_pool.c index 94847565f6b..3451e1b9da1 100644 --- a/src/panfrost/lib/pan_pool.c +++ b/src/panfrost/lib/pan_pool.c @@ -27,9 +27,6 @@ #include "pan_bo.h" #include "pan_pool.h" -/* TODO: What does this actually have to be? */ -#define ALIGNMENT 128 - /* Transient command stream pooling: command stream uploads try to simply copy * into whereever we left off. 
If there isn't space, we allocate a new entry * into the pool and copy there */ @@ -80,14 +77,11 @@ panfrost_create_pool(void *memctx, struct panfrost_device *dev, } struct panfrost_transfer -panfrost_pool_alloc(struct pan_pool *pool, size_t sz) +panfrost_pool_alloc_aligned(struct pan_pool *pool, size_t sz, unsigned alignment) { - /* Pad the size */ - sz = ALIGN_POT(sz, ALIGNMENT); - /* Find or create a suitable BO */ struct panfrost_bo *bo = pool->transient_bo; - unsigned offset = pool->transient_offset; + unsigned offset = ALIGN_POT(pool->transient_offset, alignment); /* If we don't fit, allocate a new backing */ if (unlikely(bo == NULL || (offset + sz) >= TRANSIENT_SLAB_SIZE)) { @@ -96,7 +90,7 @@ panfrost_pool_alloc(struct pan_pool *pool, size_t sz) offset = 0; } - pool->transient_offset += sz; + pool->transient_offset = offset + sz; struct panfrost_transfer ret = { .cpu = bo->cpu + offset, @@ -104,7 +98,6 @@ panfrost_pool_alloc(struct pan_pool *pool, size_t sz) }; return ret; - } mali_ptr diff --git a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h index 22fddb8870c..08612a37065 100644 --- a/src/panfrost/lib/pan_pool.h +++ b/src/panfrost/lib/pan_pool.h @@ -61,6 +61,17 @@ struct panfrost_transfer { mali_ptr gpu; }; +struct panfrost_transfer +panfrost_pool_alloc_aligned(struct pan_pool *pool, size_t sz, unsigned alignment); + +/* Default to self-alignment: the alignment passed is util_next_power_of_two(sz) */ + +static inline struct panfrost_transfer +panfrost_pool_alloc(struct pan_pool *pool, size_t sz) +{ + return panfrost_pool_alloc_aligned(pool, sz, util_next_power_of_two(sz)); +} + struct panfrost_transfer panfrost_pool_alloc(struct pan_pool *pool, size_t sz); diff --git a/src/panfrost/lib/pan_scoreboard.c b/src/panfrost/lib/pan_scoreboard.c index 85ae14df266..f1ab2cac02c 100644 --- a/src/panfrost/lib/pan_scoreboard.c +++ b/src/panfrost/lib/pan_scoreboard.c @@ -145,7 +145,8 @@ panfrost_new_job( if (inject) job.next_job = scoreboard->first_job; - struct panfrost_transfer transfer = 
panfrost_pool_alloc(pool, sizeof(job) + payload_size); + struct panfrost_transfer transfer = + panfrost_pool_alloc_aligned(pool, sizeof(job) + payload_size, 64); memcpy(transfer.cpu, &job, sizeof(job)); memcpy(transfer.cpu + sizeof(job), payload, payload_size); -- 2.30.2