panfrost: Avoid minimum stack allocations
author Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Mon, 17 Aug 2020 16:30:49 +0000 (12:30 -0400)
committer Tomeu Vizoso <tomeu.vizoso@collabora.com>
Thu, 20 Aug 2020 16:15:00 +0000 (18:15 +0200)
If the stack isn't used, don't allocate it - simple as that.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6373>

src/gallium/drivers/panfrost/pan_cmdstream.c
src/gallium/drivers/panfrost/pan_job.c
src/gallium/drivers/panfrost/pan_job.h
src/gallium/drivers/panfrost/pan_mfbd.c
src/panfrost/lib/pan_encoder.h
src/panfrost/lib/pan_scratch.c

index 02e50cb3c78b8662fc3a98ebab12886f184b9bc6..bd01daccf91439111fd4a9e8fe83f9aadc0fe087 100644 (file)
@@ -58,12 +58,20 @@ panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
         struct panfrost_device *dev = pan_device(ctx->base.screen);
         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 
-        unsigned shift = panfrost_get_stack_shift(batch->stack_size);
         struct mali_shared_memory shared = {
-                .stack_shift = shift,
-                .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
                 .shared_workgroup_count = ~0,
         };
+
+        if (batch->stack_size) {
+                struct panfrost_bo *stack =
+                        panfrost_batch_get_scratchpad(batch, batch->stack_size,
+                                        dev->thread_tls_alloc,
+                                        dev->core_count);
+
+                shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
+                shared.scratchpad = stack->gpu;
+        }
+
         postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
 }
 
index a0160293f951a28208da37aeb583395a1a03bc20..b1c6805b6e651cd9cb2eb65b299678dd5683f85b 100644 (file)
@@ -639,11 +639,11 @@ panfrost_batch_get_polygon_list(struct panfrost_batch *batch, unsigned size)
 
 struct panfrost_bo *
 panfrost_batch_get_scratchpad(struct panfrost_batch *batch,
-                unsigned shift,
+                unsigned size_per_thread,
                 unsigned thread_tls_alloc,
                 unsigned core_count)
 {
-        unsigned size = panfrost_get_total_stack_size(shift,
+        unsigned size = panfrost_get_total_stack_size(size_per_thread,
                         thread_tls_alloc,
                         core_count);
 
index 00edd9574cb9699b6385775a4a8b8a71ce25572c..664d5da66836e088c01df5dbc1a0e534d38fd383 100644 (file)
@@ -174,7 +174,7 @@ void
 panfrost_batch_adjust_stack_size(struct panfrost_batch *batch);
 
 struct panfrost_bo *
-panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned shift, unsigned thread_tls_alloc, unsigned core_count);
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count);
 
 struct panfrost_bo *
 panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count);
index 6cbf2f219fe9484145230b7f4cc1aac34d47b2f3..4ae7320a992c71a640397d8ba9c94b267b06dd06 100644 (file)
@@ -498,13 +498,16 @@ panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count)
                 mfbd.msaa.sample_locations = panfrost_emit_sample_locations(batch);
                 mfbd.tiler_meta = panfrost_batch_get_tiler_meta(batch, vertex_count);
         } else {
-                unsigned shift = panfrost_get_stack_shift(batch->stack_size);
-                struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
-                                                                       shift,
-                                                                       dev->thread_tls_alloc,
-                                                                       dev->core_count);
-                mfbd.shared_memory.stack_shift = shift;
-                mfbd.shared_memory.scratchpad = bo->gpu;
+                if (batch->stack_size) {
+                        unsigned shift = panfrost_get_stack_shift(batch->stack_size);
+                        struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
+                                                                               batch->stack_size,
+                                                                               dev->thread_tls_alloc,
+                                                                               dev->core_count);
+                        mfbd.shared_memory.stack_shift = shift;
+                        mfbd.shared_memory.scratchpad = bo->gpu;
+                }
+
                 mfbd.shared_memory.shared_workgroup_count = ~0;
 
                 mfbd.tiler = panfrost_emit_midg_tiler(batch, vertex_count);
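Both emit paths (panfrost_vt_emit_shared_memory above and the MFBD path here) now follow the same pattern; the helper below is a hypothetical condensation of that pattern for illustration only, not part of the patch, using the identifiers already visible in the hunks above:

/* Hypothetical sketch of the shared pattern; not part of the patch.
 * Leaves stack_shift/scratchpad zeroed when the batch needs no stack. */
static void
sketch_emit_stack(struct panfrost_batch *batch, struct panfrost_device *dev,
                  struct mali_shared_memory *shared)
{
        if (!batch->stack_size)
                return;

        struct panfrost_bo *stack =
                panfrost_batch_get_scratchpad(batch, batch->stack_size,
                                              dev->thread_tls_alloc,
                                              dev->core_count);

        shared->stack_shift = panfrost_get_stack_shift(batch->stack_size);
        shared->scratchpad = stack->gpu;
}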
index a992a4c7427fca2acb1fea10d037b5ba59da20d2..0471701dbfb5524458478fd6e9a1a898d9c048a0 100644 (file)
@@ -74,7 +74,7 @@ panfrost_get_stack_shift(unsigned stack_size);
 
 unsigned
 panfrost_get_total_stack_size(
-                unsigned stack_shift,
+                unsigned thread_size,
                 unsigned threads_per_core,
                 unsigned core_count);
 
index 478a788b116d54a6cf77b961c62c4bcff9615ba5..47c98f3f7be5d1ce20d6a0edb3a9d0bff09b0188 100644 (file)
@@ -25,6 +25,7 @@
  */
 
 #include "util/u_math.h"
+#include "util/macros.h"
 #include "pan_encoder.h"
 
 /* Midgard has a small register file, so shaders with high register pressure
@@ -93,17 +94,16 @@ panfrost_get_stack_shift(unsigned stack_size)
                 return 0;
 }
 
-/* Computes the aligned stack size given the shift and thread count. The blob
- * reserves an extra page, and since this is hardware-internal, we do too. */
+/* Computes the aligned stack size given the per-thread stack size and thread count. */
 
 unsigned
 panfrost_get_total_stack_size(
-                unsigned stack_shift,
+                unsigned thread_size,
                 unsigned threads_per_core,
                 unsigned core_count)
 {
-        unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32);
-        unsigned size = size_per_thread * threads_per_core * core_count;
+        unsigned size_per_thread = (thread_size == 0) ? 0 :
+                util_next_power_of_two(ALIGN_POT(thread_size, 16));
 
-        return size + 4096;
+        return size_per_thread * threads_per_core * core_count;
 }
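To make the new sizing concrete, a hedged worked example follows; the thread_tls_alloc (256 threads per core) and core_count (4) values are illustrative assumptions, not figures taken from the patch:

/* Illustrative numbers only; 256 threads/core and 4 cores are assumptions.
 *
 * thread_size = 0:
 *     size_per_thread = 0, total = 0 bytes (no scratchpad BO at all)
 *     (before this change: MAX2(1 << 4, 32) * 256 * 4 + 4096 = 36864 bytes)
 *
 * thread_size = 416:
 *     ALIGN_POT(416, 16) = 416, util_next_power_of_two(416) = 512
 *     total = 512 * 256 * 4 = 524288 bytes (512 KiB)
 */
unsigned none     = panfrost_get_total_stack_size(0,   256, 4);
unsigned spilling = panfrost_get_total_stack_size(416, 256, 4);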