From d58600c0e0af98fa2173aaa4dc996ea71502208a Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Date: Tue, 31 Dec 2019 21:37:30 -0500
Subject: [PATCH] panfrost: Pack MRT blend shaders into a single BO

Blend shader size and location in memory is considerably constrained,
probably to facilitate optimizations (my guess is that blend shaders are
run strictly out of i-cache). We need to pack the blend shaders for each
RT of a single framebuffer together. The easiest way to do this is at
draw time which is not terribly efficient but will hold us over for now.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
---
 src/gallium/drivers/panfrost/pan_blend.h     |  6 ++--
 src/gallium/drivers/panfrost/pan_blend_cso.c | 29 +++++++++++++-------
 src/gallium/drivers/panfrost/pan_context.c   | 25 +++++++++++------
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_blend.h b/src/gallium/drivers/panfrost/pan_blend.h
index 83fe35e325c..1b1cb9df31d 100644
--- a/src/gallium/drivers/panfrost/pan_blend.h
+++ b/src/gallium/drivers/panfrost/pan_blend.h
@@ -55,8 +55,8 @@ struct panfrost_blend_shader {
 /* A blend shader descriptor ready for actual use */
 
 struct panfrost_blend_shader_final {
-        /* The compiled shader in GPU memory, possibly patched */
-        struct panfrost_bo *bo;
+        /* GPU address where we're compiled to */
+        uint64_t gpu;
 
         /* First instruction tag (for tagging the pointer) */
         unsigned first_tag;
@@ -113,6 +113,6 @@ void
 panfrost_blend_context_init(struct pipe_context *pipe);
 
 struct panfrost_blend_final
-panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rt);
+panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rt, struct panfrost_bo **bo, unsigned *shader_offset);
 
 #endif
diff --git a/src/gallium/drivers/panfrost/pan_blend_cso.c b/src/gallium/drivers/panfrost/pan_blend_cso.c
index 1dd211e6b4c..b6c46323020 100644
--- a/src/gallium/drivers/panfrost/pan_blend_cso.c
+++ b/src/gallium/drivers/panfrost/pan_blend_cso.c
@@ -227,7 +227,7 @@ panfrost_blend_constant(float *out, float *in, unsigned mask)
 /* Create a final blend given the context */
 
 struct panfrost_blend_final
-panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
+panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti, struct panfrost_bo **bo, unsigned *shader_offset)
 {
         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 
@@ -273,23 +273,32 @@ panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
         final.shader.work_count = shader->work_count;
         final.shader.first_tag = shader->first_tag;
 
-        /* Upload the shader */
-        final.shader.bo = panfrost_batch_create_bo(batch, shader->size,
-                                                   PAN_BO_EXECUTE,
-                                                   PAN_BO_ACCESS_PRIVATE |
-                                                   PAN_BO_ACCESS_READ |
-                                                   PAN_BO_ACCESS_VERTEX_TILER |
-                                                   PAN_BO_ACCESS_FRAGMENT);
-        memcpy(final.shader.bo->cpu, shader->buffer, shader->size);
+        /* Upload the shader, sharing a BO */
+        if (!(*bo)) {
+                *bo = panfrost_batch_create_bo(batch, 4096,
+                   PAN_BO_EXECUTE,
+                   PAN_BO_ACCESS_PRIVATE |
+                   PAN_BO_ACCESS_READ |
+                   PAN_BO_ACCESS_VERTEX_TILER |
+                   PAN_BO_ACCESS_FRAGMENT);
+        }
+
+        /* Size check */
+        assert((*shader_offset + shader->size) < 4096);
+
+        memcpy((*bo)->cpu + *shader_offset, shader->buffer, shader->size);
+        final.shader.gpu = (*bo)->gpu + *shader_offset;
 
         if (shader->patch_index) {
                 /* We have to specialize the blend shader to use constants, so
                  * patch in the current constants */
 
-                float *patch = (float *) (final.shader.bo->cpu + shader->patch_index);
+                float *patch = (float *) ((*bo)->cpu + *shader_offset + shader->patch_index);
                 memcpy(patch, ctx->blend_color.color, sizeof(float) * 4);
         }
 
+        *shader_offset += shader->size;
+
         return final;
 }
 
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 3a8d21d1d8f..8c911b5d001 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -936,9 +936,12 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
 
                 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
+                unsigned shader_offset = 0;
+                struct panfrost_bo *shader_bo = NULL;
 
-                for (unsigned c = 0; c < rt_count; ++c)
-                        blend[c] = panfrost_get_blend_for_context(ctx, c);
+                for (unsigned c = 0; c < rt_count; ++c) {
+                        blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo, &shader_offset);
+                }
 
                 /* If there is a blend shader, work registers are shared. XXX: opt */
 
@@ -979,13 +982,17 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                 /* Even on MFBD, the shader descriptor gets blend shaders. It's
                  * *also* copied to the blend_meta appended (by convention),
                  * but this is the field actually read by the hardware. (Or
-                 * maybe both are read...?) */
+                 * maybe both are read...?). Specify the last RTi with a blend
+                 * shader. */
 
-                if (blend[0].is_shader) {
-                        ctx->fragment_shader_core.blend.shader =
-                                blend[0].shader.bo->gpu | blend[0].shader.first_tag;
-                } else {
-                        ctx->fragment_shader_core.blend.shader = 0;
+                ctx->fragment_shader_core.blend.shader = 0;
+
+                for (signed rt = (rt_count - 1); rt >= 0; --rt) {
+                        if (blend[rt].is_shader) {
+                                ctx->fragment_shader_core.blend.shader =
+                                        blend[rt].shader.gpu | blend[rt].shader.first_tag;
+                                break;
+                        }
                 }
 
                 if (screen->quirks & MIDGARD_SFBD) {
@@ -1039,7 +1046,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                                 assert(!(is_srgb && blend[i].is_shader));
 
                                 if (blend[i].is_shader) {
-                                        rts[i].blend.shader = blend[i].shader.bo->gpu | blend[i].shader.first_tag;
+                                        rts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
                                 } else {
                                         rts[i].blend.equation = *blend[i].equation.equation;
                                         rts[i].blend.constant = blend[i].equation.constant;
-- 
2.30.2