From b41692caf427fc2335121e762bb3a539ef7506cb Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 17 Aug 2020 12:30:49 -0400 Subject: [PATCH] panfrost: Avoid minimum stack allocations If stack isn't used, don't allocate it - simple as that. Signed-off-by: Alyssa Rosenzweig Reviewed-by: Tomeu Vizoso Part-of: --- src/gallium/drivers/panfrost/pan_cmdstream.c | 14 +++++++++++--- src/gallium/drivers/panfrost/pan_job.c | 4 ++-- src/gallium/drivers/panfrost/pan_job.h | 2 +- src/gallium/drivers/panfrost/pan_mfbd.c | 17 ++++++++++------- src/panfrost/lib/pan_encoder.h | 2 +- src/panfrost/lib/pan_scratch.c | 12 ++++++------ 6 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index 02e50cb3c78..bd01daccf91 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -58,12 +58,20 @@ panfrost_vt_emit_shared_memory(struct panfrost_context *ctx, struct panfrost_device *dev = pan_device(ctx->base.screen); struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - unsigned shift = panfrost_get_stack_shift(batch->stack_size); struct mali_shared_memory shared = { - .stack_shift = shift, - .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu, .shared_workgroup_count = ~0, }; + + if (batch->stack_size) { + struct panfrost_bo *stack = + panfrost_batch_get_scratchpad(batch, batch->stack_size, + dev->thread_tls_alloc, + dev->core_count); + + shared.stack_shift = panfrost_get_stack_shift(batch->stack_size); + shared.scratchpad = stack->gpu; + } + postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared)); } diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index a0160293f95..b1c6805b6e6 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -639,11 +639,11 @@ 
panfrost_batch_get_polygon_list(struct panfrost_batch *batch, unsigned size) struct panfrost_bo * panfrost_batch_get_scratchpad(struct panfrost_batch *batch, - unsigned shift, + unsigned size_per_thread, unsigned thread_tls_alloc, unsigned core_count) { - unsigned size = panfrost_get_total_stack_size(shift, + unsigned size = panfrost_get_total_stack_size(size_per_thread, thread_tls_alloc, core_count); diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index 00edd9574cb..664d5da6683 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -174,7 +174,7 @@ void panfrost_batch_adjust_stack_size(struct panfrost_batch *batch); struct panfrost_bo * -panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned shift, unsigned thread_tls_alloc, unsigned core_count); +panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count); struct panfrost_bo * panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count); diff --git a/src/gallium/drivers/panfrost/pan_mfbd.c b/src/gallium/drivers/panfrost/pan_mfbd.c index 6cbf2f219fe..4ae7320a992 100644 --- a/src/gallium/drivers/panfrost/pan_mfbd.c +++ b/src/gallium/drivers/panfrost/pan_mfbd.c @@ -498,13 +498,16 @@ panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count) mfbd.msaa.sample_locations = panfrost_emit_sample_locations(batch); mfbd.tiler_meta = panfrost_batch_get_tiler_meta(batch, vertex_count); } else { - unsigned shift = panfrost_get_stack_shift(batch->stack_size); - struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch, - shift, - dev->thread_tls_alloc, - dev->core_count); - mfbd.shared_memory.stack_shift = shift; - mfbd.shared_memory.scratchpad = bo->gpu; + if (batch->stack_size) { + unsigned shift = panfrost_get_stack_shift(batch->stack_size); + struct panfrost_bo *bo = 
panfrost_batch_get_scratchpad(batch, + batch->stack_size, + dev->thread_tls_alloc, + dev->core_count); + mfbd.shared_memory.stack_shift = shift; + mfbd.shared_memory.scratchpad = bo->gpu; + } + mfbd.shared_memory.shared_workgroup_count = ~0; mfbd.tiler = panfrost_emit_midg_tiler(batch, vertex_count); diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h index a992a4c7427..0471701dbfb 100644 --- a/src/panfrost/lib/pan_encoder.h +++ b/src/panfrost/lib/pan_encoder.h @@ -74,7 +74,7 @@ panfrost_get_stack_shift(unsigned stack_size); unsigned panfrost_get_total_stack_size( - unsigned stack_shift, + unsigned thread_size, unsigned threads_per_core, unsigned core_count); diff --git a/src/panfrost/lib/pan_scratch.c b/src/panfrost/lib/pan_scratch.c index 478a788b116..47c98f3f7be 100644 --- a/src/panfrost/lib/pan_scratch.c +++ b/src/panfrost/lib/pan_scratch.c @@ -25,6 +25,7 @@ */ #include "util/u_math.h" +#include "util/macros.h" #include "pan_encoder.h" /* Midgard has a small register file, so shaders with high register pressure @@ -93,17 +94,16 @@ panfrost_get_stack_shift(unsigned stack_size) return 0; } -/* Computes the aligned stack size given the shift and thread count. The blob - * reserves an extra page, and since this is hardware-internal, we do too. */ +/* Computes the aligned stack size given the per-thread size and thread count. */ unsigned panfrost_get_total_stack_size( - unsigned stack_shift, + unsigned thread_size, unsigned threads_per_core, unsigned core_count) { - unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32); - unsigned size = size_per_thread * threads_per_core * core_count; + unsigned size_per_thread = (thread_size == 0) ? 0 : + util_next_power_of_two(ALIGN_POT(thread_size, 16)); - return size + 4096; + return size_per_thread * threads_per_core * core_count; } -- 2.30.2