panfrost: Introduce invisible pool
author Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Mon, 17 Aug 2020 14:31:02 +0000 (10:31 -0400)
committer Tomeu Vizoso <tomeu.vizoso@collabora.com>
Thu, 20 Aug 2020 16:15:00 +0000 (18:15 +0200)
Whereas the main batch->pool is CPU read/write, the new
batch->invisible_pool is never CPU mapped. This enables GPU-internal
structures that the CPU must allocate dynamically from a pool but never
reads back, corresponding to the PAN_BO_INVISIBLE create flag.

The use case is speeding up varying allocation by skipping the
CPU-side mmap/munmap.
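
In condensed form (assembled from the hunks below; "size" is stride *
count as in panfrost_emit_varyings, and the panfrost_create_pool() /
panfrost_pool_alloc() calls are used exactly as they appear in this
change):

    /* At batch creation (pan_job.c): a second pool, created without
     * preallocation and with PAN_BO_INVISIBLE, so its BOs are never
     * mmapped on the CPU side. */
    batch->invisible_pool =
            panfrost_create_pool(batch, dev, PAN_BO_INVISIBLE, false);

    /* When emitting varyings (pan_cmdstream.c): only the GPU address of
     * the allocation is consumed, so GPU-only (invisible) memory
     * suffices and no CPU mapping is ever created. */
    mali_ptr ptr = panfrost_pool_alloc(&batch->invisible_pool, size).gpu;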

We simultaneously halve the pools' minimum slab allocation
(TRANSIENT_SLAB_PAGES) to avoid negatively affecting memory usage.
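
For concreteness (numbers from the pan_device.h hunk below): each slab
shrinks from 32 pages * 4 KiB = 128 KiB to 16 pages * 4 KiB = 64 KiB, so
a batch that uses both of its pools ends up with roughly the footprint a
single pool had before.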

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6373>

src/gallium/drivers/panfrost/pan_cmdstream.c
src/gallium/drivers/panfrost/pan_job.c
src/gallium/drivers/panfrost/pan_job.h
src/panfrost/lib/pan_device.h

index 90ff477ec6625899434482653dd6c7b9b51b864b..02e50cb3c78b8662fc3a98ebab12886f184b9bc6 100644 (file)
@@ -1492,7 +1492,7 @@ panfrost_emit_varyings(struct panfrost_batch *batch,
                 unsigned stride, unsigned count)
 {
         unsigned size = stride * count;
-        mali_ptr ptr = panfrost_pool_alloc(&batch->pool, size).gpu;
+        mali_ptr ptr = panfrost_pool_alloc(&batch->invisible_pool, size).gpu;
 
         pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
                 cfg.stride = stride;
index 50616eb90172c75115f55e06e34c474d9d0cc481..a0160293f951a28208da37aeb583395a1a03bc20 100644 (file)
@@ -100,6 +100,7 @@ panfrost_create_batch(struct panfrost_context *ctx,
                       const struct pipe_framebuffer_state *key)
 {
         struct panfrost_batch *batch = rzalloc(ctx, struct panfrost_batch);
+        struct panfrost_device *dev = pan_device(ctx->base.screen);
 
         batch->ctx = ctx;
 
@@ -112,7 +113,15 @@ panfrost_create_batch(struct panfrost_context *ctx,
         batch->out_sync = panfrost_create_batch_fence(batch);
         util_copy_framebuffer_state(&batch->key, key);
 
-        batch->pool = panfrost_create_pool(batch, pan_device(ctx->base.screen), 0, true);
+        /* Preallocate the main pool, since every batch has at least one job
+         * structure so it will be used */
+        batch->pool = panfrost_create_pool(batch, dev, 0, true);
+
+        /* Don't preallocate the invisible pool, since not every batch will use
+         * the pre-allocation, particularly if the varyings are larger than the
+         * preallocation and a reallocation is needed after anyway. */
+        batch->invisible_pool =
+                panfrost_create_pool(batch, dev, PAN_BO_INVISIBLE, false);
 
         panfrost_batch_add_fbo_bos(batch);
 
@@ -170,6 +179,9 @@ panfrost_free_batch(struct panfrost_batch *batch)
         hash_table_foreach(batch->pool.bos, entry)
                 panfrost_bo_unreference((struct panfrost_bo *)entry->key);
 
+        hash_table_foreach(batch->invisible_pool.bos, entry)
+                panfrost_bo_unreference((struct panfrost_bo *)entry->key);
+
         util_dynarray_foreach(&batch->dependencies,
                               struct panfrost_batch_fence *, dep) {
                 panfrost_batch_fence_unreference(*dep);
@@ -985,7 +997,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         submit.jc = first_job_desc;
         submit.requirements = reqs;
 
-        bo_handles = calloc(batch->pool.bos->entries + batch->bos->entries, sizeof(*bo_handles));
+        bo_handles = calloc(batch->pool.bos->entries + batch->invisible_pool.bos->entries + batch->bos->entries, sizeof(*bo_handles));
         assert(bo_handles);
 
         hash_table_foreach(batch->bos, entry)
@@ -994,6 +1006,9 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         hash_table_foreach(batch->pool.bos, entry)
                 panfrost_batch_record_bo(entry, bo_handles, submit.bo_handle_count++);
 
+        hash_table_foreach(batch->invisible_pool.bos, entry)
+                panfrost_batch_record_bo(entry, bo_handles, submit.bo_handle_count++);
+
         submit.bo_handles = (u64) (uintptr_t) bo_handles;
         ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);
         free(bo_handles);
index e94dd76ad0c007a46aee6c8c68f62311af1a5e99..00edd9574cb9699b6385775a4a8b8a71ce25572c 100644 (file)
@@ -95,6 +95,11 @@ struct panfrost_batch {
         /* Pool owned by this batch (released when the batch is released) used for temporary descriptors */
         struct pan_pool pool;
 
+        /* Pool also owned by this batch that is not CPU mapped (created as
+         * INVISIBLE) used for private GPU-internal structures, particularly
+         * varyings */
+        struct pan_pool invisible_pool;
+
         /* Job scoreboarding state */
         struct pan_scoreboard scoreboard;
 
index b1996d0a6451d41a206eea05d0cc038f84c491a0..29eb599e1bf89d47ef037529372b23d5e93c3f5a 100644 (file)
@@ -45,7 +45,7 @@
 /* Transient slab size. This is a balance between fragmentation against cache
  * locality and ease of bookkeeping */
 
-#define TRANSIENT_SLAB_PAGES (32) /* 128kb */
+#define TRANSIENT_SLAB_PAGES (16) /* 64kb */
 #define TRANSIENT_SLAB_SIZE (4096 * TRANSIENT_SLAB_PAGES)
 
 /* Maximum number of transient slabs so we don't need dynamic arrays. Most