If the stack isn't used, don't allocate it - simple as that.
Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6373>
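
For a sense of what this saves, here is a standalone before/after sketch of the sizing math (the pot() helper and the thread/core counts are made-up illustrations, not values from this patch):

#include <stdio.h>

/* Illustrative stand-in for util_next_power_of_two(). */
static unsigned pot(unsigned x)
{
        unsigned p = 1;
        while (p < x)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned threads = 256, cores = 4; /* hypothetical GPU config */

        /* Old: an unused stack (shift 0) still cost
         * MAX2(1 << (0 + 4), 32) = 32 bytes per thread, plus an
         * extra page on top. */
        unsigned old_size = 32 * threads * cores + 4096; /* 36864 */

        /* New: an unused stack sizes to zero, and no BO is created. */
        unsigned thread_size = 0;
        unsigned per_thread = thread_size ?
                pot((thread_size + 15) & ~15u) : 0;
        unsigned new_size = per_thread * threads * cores; /* 0 */

        printf("old: %u bytes, new: %u bytes\n", old_size, new_size);
        return 0;
}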
struct panfrost_device *dev = pan_device(ctx->base.screen);
struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
- unsigned shift = panfrost_get_stack_shift(batch->stack_size);
struct mali_shared_memory shared = {
- .stack_shift = shift,
- .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
.shared_workgroup_count = ~0,
};
+
+ if (batch->stack_size) {
+ struct panfrost_bo *stack =
+ panfrost_batch_get_scratchpad(batch, batch->stack_size,
+ dev->thread_tls_alloc,
+ dev->core_count);
+
+ shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
+ shared.scratchpad = stack->gpu;
+ }
+
postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
}
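
Note the guard works because `shared` is built with a designated initializer: every field not named, including stack_shift and scratchpad, is implicitly zeroed, so a stack-less batch uploads a descriptor with no stack attached. A minimal sketch of that semantics, with the struct abbreviated to the fields touched here:

#include <stdint.h>
#include <assert.h>

/* Abbreviated stand-in for the real mali_shared_memory layout. */
struct example_shared_memory {
        uint32_t stack_shift;
        uint64_t scratchpad;
        uint32_t shared_workgroup_count;
};

int main(void)
{
        struct example_shared_memory shared = {
                .shared_workgroup_count = ~0,
        };

        /* C guarantees the unnamed fields are zero-initialized. */
        assert(shared.stack_shift == 0);
        assert(shared.scratchpad == 0);
        return 0;
}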
struct panfrost_bo *
panfrost_batch_get_scratchpad(struct panfrost_batch *batch,
- unsigned shift,
+ unsigned size_per_thread,
unsigned thread_tls_alloc,
unsigned core_count)
{
- unsigned size = panfrost_get_total_stack_size(shift,
+ unsigned size = panfrost_get_total_stack_size(size_per_thread,
thread_tls_alloc,
core_count);
panfrost_batch_adjust_stack_size(struct panfrost_batch *batch);
struct panfrost_bo *
-panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned shift, unsigned thread_tls_alloc, unsigned core_count);
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size_per_thread, unsigned thread_tls_alloc, unsigned core_count);
struct panfrost_bo *
panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count);
mfbd.msaa.sample_locations = panfrost_emit_sample_locations(batch);
mfbd.tiler_meta = panfrost_batch_get_tiler_meta(batch, vertex_count);
} else {
- unsigned shift = panfrost_get_stack_shift(batch->stack_size);
- struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
- shift,
- dev->thread_tls_alloc,
- dev->core_count);
- mfbd.shared_memory.stack_shift = shift;
- mfbd.shared_memory.scratchpad = bo->gpu;
+ if (batch->stack_size) {
+ unsigned shift = panfrost_get_stack_shift(batch->stack_size);
+ struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
+ batch->stack_size,
+ dev->thread_tls_alloc,
+ dev->core_count);
+ mfbd.shared_memory.stack_shift = shift;
+ mfbd.shared_memory.scratchpad = bo->gpu;
+ }
+
mfbd.shared_memory.shared_workgroup_count = ~0;
mfbd.tiler = panfrost_emit_midg_tiler(batch, vertex_count);
unsigned
panfrost_get_total_stack_size(
- unsigned stack_shift,
+ unsigned thread_size,
unsigned threads_per_core,
unsigned core_count);
*/
#include "util/u_math.h"
+#include "util/macros.h"
#include "pan_encoder.h"
/* Midgard has a small register file, so shaders with high register pressure
return 0;
}
-/* Computes the aligned stack size given the shift and thread count. The blob
- * reserves an extra page, and since this is hardware-internal, we do too. */
+/* Computes the aligned stack size given the per-thread size and thread count. */
unsigned
panfrost_get_total_stack_size(
- unsigned stack_shift,
+ unsigned thread_size,
unsigned threads_per_core,
unsigned core_count)
{
- unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32);
- unsigned size = size_per_thread * threads_per_core * core_count;
+ unsigned size_per_thread = (thread_size == 0) ? 0 :
+ util_next_power_of_two(ALIGN_POT(thread_size, 16));
- return size + 4096;
+ return size_per_thread * threads_per_core * core_count;
}
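
A quick worked case for the nonzero path (the inputs are hypothetical): a shader spilling 100 bytes per thread aligns up to 112, rounds to the 128-byte power-of-two bucket, and on a 4-core part with 256 threads per core totals 131072 bytes:

#include "util/macros.h"   /* ALIGN_POT */
#include "util/u_math.h"   /* util_next_power_of_two */

/* Worked example of the new sizing; 100/256/4 are made-up inputs. */
static unsigned example_total_stack_size(void)
{
        unsigned aligned    = ALIGN_POT(100u, 16);             /* 112    */
        unsigned per_thread = util_next_power_of_two(aligned); /* 128    */
        return per_thread * 256 * 4;                           /* 131072 */
}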