From 254f40fd535ef57dee2bcc4afd97840749ce5918 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 5 Feb 2020 15:58:28 -0500 Subject: [PATCH] panfrost: Identify mali_shared_memory structure This small structure is used to configure shared memory and stack for compute shaders, and is also present at the beginning of framebuffer descriptors. Let's factor it out. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/gallium/drivers/panfrost/pan_compute.c | 14 +-- src/gallium/drivers/panfrost/pan_mfbd.c | 12 ++- src/gallium/drivers/panfrost/pan_sfbd.c | 6 +- src/panfrost/include/panfrost-job.h | 53 ++++++---- src/panfrost/pandecode/decode.c | 107 ++++++++++----------- 5 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_compute.c b/src/gallium/drivers/panfrost/pan_compute.c index 56bac7a8523..33618cb6997 100644 --- a/src/gallium/drivers/panfrost/pan_compute.c +++ b/src/gallium/drivers/panfrost/pan_compute.c @@ -127,20 +127,12 @@ panfrost_launch_grid(struct pipe_context *pipe, panfrost_emit_for_draw(ctx, false); - /* Compute jobs have a "compute FBD". It's not a real framebuffer - * descriptor - there is no framebuffer - but it takes the place of - * one. As far as I can tell, it's actually the beginning of a - * single-render-target framebuffer descriptor with almost everything - * zeroed out. 
- */ - struct mali_compute_fbd compute_fbd = { - .unknown1 = { - 0, 0x1F, 0, 0, 0, 0, 0, 0 - } + struct mali_shared_memory shared = { + .shared_workgroup_count = ~0 }; payload->postfix.framebuffer = - panfrost_upload_transient(batch, &compute_fbd, sizeof(compute_fbd)); + panfrost_upload_transient(batch, &shared, sizeof(shared)); /* Invoke according to the grid info */ diff --git a/src/gallium/drivers/panfrost/pan_mfbd.c b/src/gallium/drivers/panfrost/pan_mfbd.c index fe427c452b0..3e0f5cbd275 100644 --- a/src/gallium/drivers/panfrost/pan_mfbd.c +++ b/src/gallium/drivers/panfrost/pan_mfbd.c @@ -380,12 +380,14 @@ panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count) .rt_count_1 = MALI_POSITIVE(batch->key.nr_cbufs), .rt_count_2 = 4, - .unknown2 = 0x1f, .tiler = panfrost_emit_midg_tiler(batch, vertex_count), - - .stack_shift = shift, - .unk0 = 0x1e, - .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu + + .shared_memory = { + .unk0 = 0x1e, + .stack_shift = shift, + .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu, + .shared_workgroup_count = ~0, + } }; return framebuffer; diff --git a/src/gallium/drivers/panfrost/pan_sfbd.c b/src/gallium/drivers/panfrost/pan_sfbd.c index 97d00651076..a4d29adc460 100644 --- a/src/gallium/drivers/panfrost/pan_sfbd.c +++ b/src/gallium/drivers/panfrost/pan_sfbd.c @@ -213,12 +213,14 @@ panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count) struct mali_single_framebuffer framebuffer = { .width = MALI_POSITIVE(width), .height = MALI_POSITIVE(height), - .unknown2 = 0x1f, + .shared_memory = { + .shared_workgroup_count = ~0, + .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, screen->core_count)->gpu, + }, .format = { .unk3 = 0x3, }, .clear_flags = 0x1000, - .scratchpad = panfrost_batch_get_scratchpad(batch, shift, screen->thread_tls_alloc, 
screen->core_count)->gpu, .tiler = panfrost_emit_midg_tiler(batch, vertex_count), }; diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h index 1a59f4c77c8..8027abebf34 100644 --- a/src/panfrost/include/panfrost-job.h +++ b/src/panfrost/include/panfrost-job.h @@ -1470,14 +1470,41 @@ struct mali_sfbd_format { unsigned unk3 : 4; }; -struct mali_single_framebuffer { - u32 unknown1; - u32 unknown2; +/* Shared structure at the start of framebuffer descriptors, or used bare for + * compute jobs, configuring stack and shared memory */ + +struct mali_shared_memory { + u32 stack_shift : 4; + u32 unk0 : 28; + + /* Configuration for shared memory for compute shaders. + * shared_workgroup_count is logarithmic and may be computed for a + * compute shader using shared memory as: + * + * shared_workgroup_count = MAX2(ceil(log2(count_x)) + ... + ceil(log2(count_z)), 10) + * + * For compute shaders that don't use shared memory, or non-compute + * shaders, this is set to ~0 + */ + + u32 shared_workgroup_count : 5; + u32 shared_unk1 : 3; + u32 shared_shift : 4; + u32 shared_zero : 20; + mali_ptr scratchpad; - u64 zero1; - u64 zero0; + /* For compute shaders, the RAM backing of workgroup-shared memory. For + * fragment shaders on Bifrost, apparently multisampling locations */ + + mali_ptr shared_memory; + mali_ptr unknown1; +} __attribute__((packed)); + + +struct mali_single_framebuffer { + struct mali_shared_memory shared_memory; struct mali_sfbd_format format; u32 clear_flags; @@ -1540,13 +1567,6 @@ struct mali_single_framebuffer { /* More below this, maybe */ } __attribute__((packed)); -/* On Midgard, this "framebuffer descriptor" is used for the framebuffer field - * of compute jobs. 
Superficially resembles a single framebuffer descriptor */ - -struct mali_compute_fbd { - u32 unknown1[8]; -} __attribute__((packed)); - /* Format bits for the render target flags */ #define MALI_MFBD_FORMAT_MSAA (1 << 1) @@ -1675,15 +1695,8 @@ struct bifrost_fb_extra { #define MALI_MFBD_EXTRA (1 << 13) struct bifrost_framebuffer { - u32 stack_shift : 4; - u32 unk0 : 28; + struct mali_shared_memory shared_memory; - u32 unknown2; // = 0x1f, same as SFBD - mali_ptr scratchpad; - - /* 0x10 */ - mali_ptr sample_locations; - mali_ptr unknown1; /* 0x20 */ u16 width1, height1; u32 zero3; diff --git a/src/panfrost/pandecode/decode.c b/src/panfrost/pandecode/decode.c index dc755fcc364..41b843f7a3b 100644 --- a/src/panfrost/pandecode/decode.c +++ b/src/panfrost/pandecode/decode.c @@ -666,6 +666,41 @@ pandecode_sfbd_format(struct mali_sfbd_format format) pandecode_log("},\n"); } +static void +pandecode_shared_memory(const struct mali_shared_memory *desc, bool is_compute) +{ + pandecode_prop("stack_shift = 0x%x", desc->stack_shift); + + if (desc->unk0) + pandecode_prop("unk0 = 0x%x", desc->unk0); + + if (desc->shared_workgroup_count != 0x1F) { + pandecode_prop("shared_workgroup_count = %d", desc->shared_workgroup_count); + if (!is_compute) + pandecode_msg("XXX: wrong workgroup count for noncompute\n"); + } + + if (desc->shared_unk1 || desc->shared_shift) { + pandecode_prop("shared_unk1 = %X", desc->shared_unk1); + pandecode_prop("shared_shift = %X", desc->shared_shift); + + if (!is_compute) + pandecode_msg("XXX: shared memory configured in noncompute shader\n"); + } + + if (desc->shared_zero) { + pandecode_msg("XXX: shared memory zero tripped\n"); + pandecode_prop("shared_zero = 0x%" PRIx32, desc->shared_zero); + } + + if (desc->shared_memory && !is_compute) + pandecode_msg("XXX: shared memory used in noncompute shader\n"); + + MEMORY_PROP(desc, scratchpad); + MEMORY_PROP(desc, shared_memory); + MEMORY_PROP(desc, unknown1); +} + static struct pandecode_fbd 
pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) { @@ -680,8 +715,11 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; - pandecode_prop("unknown1 = 0x%" PRIx32, s->unknown1); - pandecode_prop("unknown2 = 0x%" PRIx32, s->unknown2); + pandecode_log(".shared_memory = {\n"); + pandecode_indent++; + pandecode_shared_memory(&s->shared_memory, false); + pandecode_indent--; + pandecode_log("},\n"); pandecode_sfbd_format(s->format); @@ -748,7 +786,6 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) pandecode_prop("clear_stencil = 0x%x", s->clear_stencil); } - MEMORY_PROP(s, scratchpad); const struct midgard_tiler_descriptor t = s->tiler; bool has_hierarchy = !(gpu_id == 0x0720 || gpu_id == 0x0820 || gpu_id == 0x0830); @@ -757,8 +794,6 @@ pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id) pandecode_indent--; pandecode_log("};\n"); - pandecode_prop("zero0 = 0x%" PRIx64, s->zero0); - pandecode_prop("zero1 = 0x%" PRIx64, s->zero1); pandecode_prop("zero2 = 0x%" PRIx32, s->zero2); pandecode_prop("zero4 = 0x%" PRIx32, s->zero4); pandecode_prop("zero5 = 0x%" PRIx32, s->zero5); @@ -784,20 +819,13 @@ static void pandecode_compute_fbd(uint64_t gpu_va, int job_no) { struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va); - const struct mali_compute_fbd *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); + const struct mali_shared_memory *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va); - pandecode_log("struct mali_compute_fbd framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); + pandecode_log("struct mali_shared_memory shared_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; - - pandecode_log(".unknown1 = {"); - - for (int i = 0; i < ARRAY_SIZE(s->unknown1); ++i) - pandecode_log_cont("%X, ", s->unknown1[i]); - - 
pandecode_log("},\n"); - + pandecode_shared_memory(s, true); pandecode_indent--; - pandecode_log_cont("},\n"); + pandecode_log("},\n"); } /* Extracts the number of components associated with a Mali format */ @@ -1034,45 +1062,14 @@ pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_comput struct pandecode_fbd info; - if (fb->sample_locations) { - /* The blob stores all possible sample locations in a single buffer - * allocated on startup, and just switches the pointer when switching - * MSAA state. For now, we just put the data into the cmdstream, but we - * should do something like what the blob does with a real driver. - * - * There seem to be 32 slots for sample locations, followed by another - * 16. The second 16 is just the center location followed by 15 zeros - * in all the cases I've identified (maybe shader vs. depth/color - * samples?). - */ - - struct pandecode_mapped_memory *smem = pandecode_find_mapped_gpu_mem_containing(fb->sample_locations); - - const u16 *PANDECODE_PTR_VAR(samples, smem, fb->sample_locations); - - pandecode_log("uint16_t sample_locations_%d[] = {\n", job_no); - pandecode_indent++; - - for (int i = 0; i < 32 + 16; i++) { - pandecode_log("%d, %d,\n", samples[2 * i], samples[2 * i + 1]); - } - - pandecode_indent--; - pandecode_log("};\n"); - } - pandecode_log("struct bifrost_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no); pandecode_indent++; - pandecode_prop("stack_shift = 0x%x", fb->stack_shift); - pandecode_prop("unk0 = 0x%x", fb->unk0); - - if (fb->sample_locations) - pandecode_prop("sample_locations = sample_locations_%d", job_no); - - /* Assume that unknown1 was emitted in the last job for - * now */ - MEMORY_PROP(fb, unknown1); + pandecode_log(".shared_memory = {\n"); + pandecode_indent++; + pandecode_shared_memory(&fb->shared_memory, is_compute); + pandecode_indent--; + pandecode_log("},\n"); info.width = fb->width1 + 1; info.height = fb->height1 + 1; @@ -1098,12 +1095,6 @@ 
pandecode_mfbd_bfr(uint64_t gpu_va, int job_no, bool is_fragment, bool is_comput if (fb->clear_depth) pandecode_prop("clear_depth = %f", fb->clear_depth); - /* TODO: What is this? Let's not blow up.. */ - if (fb->unknown2 != 0x1F) - pandecode_prop("unknown2 = 0x%x", fb->unknown2); - - pandecode_prop("unknown2 = 0x%x", fb->unknown2); - MEMORY_PROP(fb, scratchpad); const struct midgard_tiler_descriptor t = fb->tiler; if (!is_compute) pandecode_midgard_tiler_descriptor(&t, fb->width1 + 1, fb->height1 + 1, is_fragment, true); -- 2.30.2