panfrost: Identify mali_shared_memory structure
author Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Wed, 5 Feb 2020 20:58:28 +0000 (15:58 -0500)
committer Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Sun, 16 Feb 2020 14:16:46 +0000 (09:16 -0500)
This small structure is used to configure shared memory and stack for
compute shaders, and is also present at the beginning of framebuffer
descriptors. Let's factor it out.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3835>

src/gallium/drivers/panfrost/pan_compute.c
src/gallium/drivers/panfrost/pan_mfbd.c
src/gallium/drivers/panfrost/pan_sfbd.c
src/panfrost/include/panfrost-job.h
src/panfrost/pandecode/decode.c

index 56bac7a85231e4cd08664948d4d06de5aa6f7b84..33618cb6997bef26e9bda3cd0a53941f4104a1c4 100644 (file)
@@ -127,20 +127,12 @@ panfrost_launch_grid(struct pipe_context *pipe,
 
         panfrost_emit_for_draw(ctx, false);
 
-        /* Compute jobs have a "compute FBD". It's not a real framebuffer
-         * descriptor - there is no framebuffer - but it takes the place of
-         * one. As far as I can tell, it's actually the beginning of a
-         * single-render-target framebuffer descriptor with almost everything
-         * zeroed out.
-         */
-        struct mali_compute_fbd compute_fbd = {
-                .unknown1 = {
-                        0, 0x1F, 0, 0, 0, 0, 0, 0
-                }
+        struct mali_shared_memory shared = {
+                .shared_workgroup_count = ~0
         };
 
         payload->postfix.framebuffer =
-                panfrost_upload_transient(batch, &compute_fbd, sizeof(compute_fbd));
+                panfrost_upload_transient(batch, &shared, sizeof(shared));
 
         /* Invoke according to the grid info */
 
index fe427c452b01d12ba525cf6c59c35a2d3f6d1cfb..3e0f5cbd2751899cbf991d76dde2b09800bb551e 100644 (file)
@@ -380,12 +380,14 @@ panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count)
                 .rt_count_1 = MALI_POSITIVE(batch->key.nr_cbufs),
                 .rt_count_2 = 4,
 
-                .unknown2 = 0x1f,
                 .tiler = panfrost_emit_midg_tiler(batch, vertex_count),
-                
-                .stack_shift = shift,
-                .unk0 = 0x1e,
-                .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu
+
+                .shared_memory = {
+                        .unk0 = 0x1e,
+                        .stack_shift = shift,
+                        .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu,
+                        .shared_workgroup_count = ~0,
+                }
         };
 
         return framebuffer;
index 97d00651076b5867b62ee8d024ec358eb0145eb4..a4d29adc4602a32d1b20f0f429029b98c16c02e8 100644 (file)
@@ -213,12 +213,14 @@ panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count)
         struct mali_single_framebuffer framebuffer = {
                 .width = MALI_POSITIVE(width),
                 .height = MALI_POSITIVE(height),
-                .unknown2 = 0x1f,
+                .shared_memory = {
+                        .shared_workgroup_count = ~0,
+                        .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu,
+                },
                 .format = {
                         .unk3 = 0x3,
                 },
                 .clear_flags = 0x1000,
-                .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu,
                 .tiler = panfrost_emit_midg_tiler(batch, vertex_count),
         };
 
index 1a59f4c77c838b3b52ad400ae986ec25bc1afeb0..8027abebf347f2928ad907082b2db9724272405f 100644 (file)
@@ -1470,14 +1470,41 @@ struct mali_sfbd_format {
         unsigned unk3 : 4;
 };
 
-struct mali_single_framebuffer {
-        u32 unknown1;
-        u32 unknown2;
+/* Shared structure at the start of framebuffer descriptors, or used bare for
+ * compute jobs, configuring stack and shared memory */
+
+struct mali_shared_memory {
+        u32 stack_shift : 4;
+        u32 unk0 : 28;
+
+        /* Configuration for shared memory for compute shaders.
+         * shared_workgroup_count is logarithmic and may be computed for a
+         * compute shader using shared memory as:
+         *
+         *  shared_workgroup_count = MAX2(ceil(log2(count_x)) + ... + ceil(log2(count_z)), 10)
+         *
+         * For compute shaders that don't use shared memory, or non-compute
+         * shaders, this is set to ~0
+         */
+
+        u32 shared_workgroup_count : 5;
+        u32 shared_unk1 : 3;
+        u32 shared_shift : 4;
+        u32 shared_zero : 20;
+
         mali_ptr scratchpad;
 
-        u64 zero1;
-        u64 zero0;
+        /* For compute shaders, the RAM backing of workgroup-shared memory. For
+         * fragment shaders on Bifrost, apparently multisampling locations */
+
+        mali_ptr shared_memory;
+        mali_ptr unknown1;
+} __attribute__((packed));
+
+
 
+struct mali_single_framebuffer {
+        struct mali_shared_memory shared_memory;
         struct mali_sfbd_format format;
 
         u32 clear_flags;
@@ -1540,13 +1567,6 @@ struct mali_single_framebuffer {
         /* More below this, maybe */
 } __attribute__((packed));
 
-/* On Midgard, this "framebuffer descriptor" is used for the framebuffer field
- * of compute jobs. Superficially resembles a single framebuffer descriptor */
-
-struct mali_compute_fbd {
-        u32 unknown1[8];
-} __attribute__((packed));
-
 /* Format bits for the render target flags */
 
 #define MALI_MFBD_FORMAT_MSAA    (1 << 1)
@@ -1675,15 +1695,8 @@ struct bifrost_fb_extra {
 #define MALI_MFBD_EXTRA (1 << 13)
 
 struct bifrost_framebuffer {
-        u32 stack_shift : 4;
-        u32 unk0 : 28;
+        struct mali_shared_memory shared_memory;
 
-        u32 unknown2; // = 0x1f, same as SFBD
-        mali_ptr scratchpad;
-
-        /* 0x10 */
-        mali_ptr sample_locations;
-        mali_ptr unknown1;
         /* 0x20 */
         u16 width1, height1;
         u32 zero3;
index dc755fcc3640b90fa534da87d5fb658cf686e53e..41b843f7a3b2e2c7669cc51e83cbb5a42295493a 100644 (file)
@@ -666,6 +666,41 @@ pandecode_sfbd_format(struct mali_sfbd_format format)
         pandecode_log("},\n");
 }
 
+/* Dumps the fields of a mali_shared_memory descriptor. The caller is
+ * responsible for emitting the surrounding braces and indentation.
+ *
+ * desc:       descriptor to dump
+ * is_compute: whether the descriptor belongs to a compute job. Shared memory
+ *             configuration found when this is false is flagged with an
+ *             "XXX" message, since non-compute users are expected to leave
+ *             shared_workgroup_count at ~0 and the other shared fields unset.
+ *
+ * stack_shift and the three pointers are always emitted; the remaining fields
+ * are only printed when they differ from their expected default. */
+
+static void
+pandecode_shared_memory(const struct mali_shared_memory *desc, bool is_compute)
+{
+        pandecode_prop("stack_shift = 0x%x", desc->stack_shift);
+
+        if (desc->unk0)
+                pandecode_prop("unk0 = 0x%x", desc->unk0);
+
+        /* 0x1F is ~0 truncated to the 5-bit field, i.e. "no shared memory" */
+        if (desc->shared_workgroup_count != 0x1F) {
+                pandecode_prop("shared_workgroup_count = %d", desc->shared_workgroup_count);
+                if (!is_compute)
+                        pandecode_msg("XXX: wrong workgroup count for noncompute\n");
+        }
+
+        if (desc->shared_unk1 || desc->shared_shift) {
+                pandecode_prop("shared_unk1 = %X", desc->shared_unk1);
+                pandecode_prop("shared_shift = %X", desc->shared_shift);
+
+                /* Terminate the message with a newline like every other
+                 * pandecode_msg here, so later output starts on its own line */
+                if (!is_compute)
+                        pandecode_msg("XXX: shared memory configured in noncompute shader\n");
+        }
+
+        if (desc->shared_zero) {
+                pandecode_msg("XXX: shared memory zero tripped\n");
+                pandecode_prop("shared_zero = 0x%" PRIx32, desc->shared_zero);
+        }
+
+        if (desc->shared_memory && !is_compute)
+                pandecode_msg("XXX: shared memory used in noncompute shader\n");
+
+        MEMORY_PROP(desc, scratchpad);
+        MEMORY_PROP(desc, shared_memory);
+        MEMORY_PROP(desc, unknown1);
+}
+
+
 static struct pandecode_fbd
 pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id)
 {
@@ -680,8 +715,11 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id)
         pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
         pandecode_indent++;
 
-        pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1);
-        pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2);
+        pandecode_log(".shared_memory = {\n");
+        pandecode_indent++;
+        pandecode_shared_memory(&s->shared_memory, false);
+        pandecode_indent--;
+        pandecode_log("},\n");
 
         pandecode_sfbd_format(s->format);
 
@@ -748,7 +786,6 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id)
                 pandecode_prop("clear_stencil = 0x%x", s->clear_stencil);
         }
 
-        MEMORY_PROP(s, scratchpad);
         const struct midgard_tiler_descriptor t = s->tiler;
 
         bool has_hierarchy = !(gpu_id == 0x0720 || gpu_id == 0x0820 || gpu_id == 0x0830);
@@ -757,8 +794,6 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id)
         pandecode_indent--;
         pandecode_log("};\n");
 
-        pandecode_prop("zero0 = 0x%" PRIx64, s->zero0);
-        pandecode_prop("zero1 = 0x%" PRIx64, s->zero1);
         pandecode_prop("zero2 = 0x%" PRIx32, s->zero2);
         pandecode_prop("zero4 = 0x%" PRIx32, s->zero4);
         pandecode_prop("zero5 = 0x%" PRIx32, s->zero5);
@@ -784,20 +819,13 @@ static void
 pandecode_compute_fbd(uint64_t gpu_va, int job_no)
 {
         struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
-        const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
+        const struct mali_shared_memory *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
 
-        pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
+        pandecode_log("struct mali_shared_memory shared_%"PRIx64"_%d = {\n", gpu_va, job_no);
         pandecode_indent++;
-
-        pandecode_log(".unknown1 = {");
-
-        for (int i = 0; i < ARRAY_SIZE(s->unknown1); ++i)
-                pandecode_log_cont("%X, ", s->unknown1[i]);
-
-        pandecode_log("},\n");
-
+        pandecode_shared_memory(s, true);
         pandecode_indent--;
-        pandecode_log_cont("},\n");
+        pandecode_log("},\n");
 }
 
 /* Extracts the number of components associated with a Mali format */
@@ -1034,45 +1062,14 @@ pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_comput
 
         struct pandecode_fbd info;
  
-        if (fb->sample_locations) {
-                /* The blob stores all possible sample locations in a single buffer
-                 * allocated on startup, and just switches the pointer when switching
-                 * MSAA state. For now, we just put the data into the cmdstream, but we
-                 * should do something like what the blob does with a real driver.
-                 *
-                 * There seem to be 32 slots for sample locations, followed by another
-                 * 16. The second 16 is just the center location followed by 15 zeros
-                 * in all the cases I've identified (maybe shader vs. depth/color
-                 * samples?).
-                 */
-
-                struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(fb->sample_locations);
-
-                const u16 *PANDECODE_PTR_VAR(samples, smem, fb->sample_locations);
-
-                pandecode_log("uint16_t sample_locations_%d[] = {\n", job_no);
-                pandecode_indent++;
-
-                for (int i = 0; i < 32 + 16; i++) {
-                        pandecode_log("%d, %d,\n", samples[2 * i], samples[2 * i + 1]);
-                }
-
-                pandecode_indent--;
-                pandecode_log("};\n");
-        }
-
         pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
         pandecode_indent++;
 
-        pandecode_prop("stack_shift = 0x%x", fb->stack_shift);
-        pandecode_prop("unk0 = 0x%x", fb->unk0);
-
-        if (fb->sample_locations)
-                pandecode_prop("sample_locations = sample_locations_%d", job_no);
-
-        /* Assume that unknown1 was emitted in the last job for
-         * now */
-        MEMORY_PROP(fb, unknown1);
+        pandecode_log(".shared_memory = {\n");
+        pandecode_indent++;
+        pandecode_shared_memory(&fb->shared_memory, is_compute);
+        pandecode_indent--;
+        pandecode_log("},\n");
 
         info.width = fb->width1 + 1;
         info.height = fb->height1 + 1;
@@ -1098,12 +1095,6 @@ pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_comput
         if (fb->clear_depth)
                 pandecode_prop("clear_depth = %f", fb->clear_depth);
 
-        /* TODO: What is this? Let's not blow up.. */
-        if (fb->unknown2 != 0x1F)
-                pandecode_prop("unknown2 = 0x%x", fb->unknown2);
-
-        pandecode_prop("unknown2 = 0x%x", fb->unknown2);
-        MEMORY_PROP(fb, scratchpad);
         const struct midgard_tiler_descriptor t = fb->tiler;
         if (!is_compute)
                 pandecode_midgard_tiler_descriptor(&t, fb->width1 + 1, fb->height1 + 1, is_fragment, true);