From 70b06fb5d55d639fd74596a2ff6971cb57c030ca Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 19 Aug 2015 09:38:14 -0700
Subject: [PATCH] vc4: Convert blending to being done in 4x8 unorm normally.

We can't do this all the time, because you want blending to be done in
linear space, and sRGB would lose too much precision being done in 4x8.
The win on instructions is pretty huge when you can, though.

total uniforms in shared programs: 32065 -> 32168 (0.32%)
uniforms in affected programs:     327 -> 430 (31.50%)
total instructions in shared programs: 92644 -> 89830 (-3.04%)
instructions in affected programs:     15580 -> 12766 (-18.06%)

Improves openarena performance at 1920x1080 from 10.7fps to 11.2fps.
---
 src/gallium/drivers/vc4/vc4_context.h         |   5 +-
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 286 +++++++++++++++---
 src/gallium/drivers/vc4/vc4_qir.h             |   2 +
 src/gallium/drivers/vc4/vc4_state.c           |   4 +-
 src/gallium/drivers/vc4/vc4_uniforms.c        |  30 +-
 5 files changed, 276 insertions(+), 51 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 7a758f8545f..86f2ce5e608 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -291,7 +291,10 @@ struct vc4_context {
 
         struct vc4_vertex_stateobj *vtx;
 
-        struct pipe_blend_color blend_color;
+        struct {
+                struct pipe_blend_color f;
+                uint8_t ub[4];
+        } blend_color;
         struct pipe_stencil_ref stencil_ref;
         unsigned sample_mask;
         struct pipe_framebuffer_state framebuffer;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 17b524653bb..373c9e12d11 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -86,11 +86,11 @@ vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear)
 }
 
 static nir_ssa_def *
-vc4_blend_channel(nir_builder *b,
-                  nir_ssa_def **src,
-                  nir_ssa_def **dst,
-                  unsigned factor,
-                  int channel)
+vc4_blend_channel_f(nir_builder *b,
+                    nir_ssa_def **src,
+                    nir_ssa_def **dst,
+                    unsigned factor,
+                    int channel)
 {
         switch(factor) {
         case PIPE_BLENDFACTOR_ONE:
@@ -146,8 +146,75 @@ vc4_blend_channel(nir_builder *b,
 }
 
 static nir_ssa_def *
-vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
-               unsigned func)
+vc4_nir_set_packed_chan(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1,
+                        int chan)
+{
+        unsigned chan_mask = 0xff << (chan * 8);
+        return nir_ior(b,
+                       nir_iand(b, src0, nir_imm_int(b, ~chan_mask)),
+                       nir_iand(b, src1, nir_imm_int(b, chan_mask)));
+}
+
+static nir_ssa_def *
+vc4_blend_channel_i(nir_builder *b,
+                    nir_ssa_def *src,
+                    nir_ssa_def *dst,
+                    nir_ssa_def *src_a,
+                    nir_ssa_def *dst_a,
+                    unsigned factor,
+                    int a_chan)
+{
+        switch (factor) {
+        case PIPE_BLENDFACTOR_ONE:
+                return nir_imm_int(b, ~0);
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return src;
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return src_a;
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return dst_a;
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return dst;
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                return vc4_nir_set_packed_chan(b,
+                                               nir_umin_4x8(b,
+                                                            src_a,
+                                                            nir_inot(b, dst_a)),
+                                               nir_imm_int(b, ~0),
+                                               a_chan);
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_RGBA);
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_AAAA);
+        case PIPE_BLENDFACTOR_ZERO:
+                return nir_imm_int(b, 0);
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return nir_inot(b, src);
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return nir_inot(b, src_a);
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return nir_inot(b, dst_a);
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return nir_inot(b, dst);
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return nir_inot(b, vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_RGBA));
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return nir_inot(b, vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_AAAA));
+
+        default:
+        case PIPE_BLENDFACTOR_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_SRC1_ALPHA:
+        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend factor %d\n", factor);
+                return nir_imm_int(b, ~0);
+        }
+}
+
+static nir_ssa_def *
+vc4_blend_func_f(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+                 unsigned func)
 {
         switch (func) {
         case PIPE_BLEND_ADD:
@@ -169,9 +236,33 @@ vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
         }
 }
 
+static nir_ssa_def *
+vc4_blend_func_i(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+                 unsigned func)
+{
+        switch (func) {
+        case PIPE_BLEND_ADD:
+                return nir_usadd_4x8(b, src, dst);
+        case PIPE_BLEND_SUBTRACT:
+                return nir_ussub_4x8(b, src, dst);
+        case PIPE_BLEND_REVERSE_SUBTRACT:
+                return nir_ussub_4x8(b, dst, src);
+        case PIPE_BLEND_MIN:
+                return nir_umin_4x8(b, src, dst);
+        case PIPE_BLEND_MAX:
+                return nir_umax_4x8(b, src, dst);
+
+        default:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend func %d\n", func);
+                return src;
+
+        }
+}
+
 static void
-vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
-                nir_ssa_def **src_color, nir_ssa_def **dst_color)
+vc4_do_blending_f(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
+                  nir_ssa_def **src_color, nir_ssa_def **dst_color)
 {
         struct pipe_rt_blend_state *blend = &c->fs_key->blend;
 
@@ -192,20 +283,106 @@ vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
                 int dst_factor = ((i != 3) ? blend->rgb_dst_factor :
                                   blend->alpha_dst_factor);
                 src_blend[i] = nir_fmul(b, src_color[i],
-                                        vc4_blend_channel(b,
-                                                          src_color, dst_color,
-                                                          src_factor, i));
+                                        vc4_blend_channel_f(b,
+                                                            src_color, dst_color,
+                                                            src_factor, i));
                 dst_blend[i] = nir_fmul(b, dst_color[i],
-                                        vc4_blend_channel(b,
-                                                          src_color, dst_color,
-                                                          dst_factor, i));
+                                        vc4_blend_channel_f(b,
+                                                            src_color, dst_color,
+                                                            dst_factor, i));
         }
 
         for (int i = 0; i < 4; i++) {
-                result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i],
-                                           ((i != 3) ? blend->rgb_func :
-                                            blend->alpha_func));
+                result[i] = vc4_blend_func_f(b, src_blend[i], dst_blend[i],
+                                             ((i != 3) ? blend->rgb_func :
+                                              blend->alpha_func));
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_splat(nir_builder *b, nir_ssa_def *src)
+{
+        nir_ssa_def *or1 = nir_ior(b, src, nir_ishl(b, src, nir_imm_int(b, 8)));
+        return nir_ior(b, or1, nir_ishl(b, or1, nir_imm_int(b, 16)));
+}
+
+static nir_ssa_def *
+vc4_do_blending_i(struct vc4_compile *c, nir_builder *b,
+                  nir_ssa_def *src_color, nir_ssa_def *dst_color,
+                  nir_ssa_def *src_float_a)
+{
+        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
+
+        if (!blend->blend_enable)
+                return src_color;
+
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+        nir_ssa_def *imm_0xff = nir_imm_int(b, 0xff);
+        nir_ssa_def *src_a = nir_pack_unorm_4x8(b, src_float_a);
+        nir_ssa_def *dst_a;
+        int alpha_chan;
+        for (alpha_chan = 0; alpha_chan < 4; alpha_chan++) {
+                if (format_swiz[alpha_chan] == 3)
+                        break;
+        }
+        if (alpha_chan != 4) {
+                nir_ssa_def *shift = nir_imm_int(b, alpha_chan * 8);
+                dst_a = vc4_nir_splat(b, nir_iand(b, nir_ushr(b, dst_color,
+                                                              shift), imm_0xff));
+        } else {
+                dst_a = nir_imm_int(b, ~0);
+        }
+
+        nir_ssa_def *src_factor = vc4_blend_channel_i(b,
+                                                      src_color, dst_color,
+                                                      src_a, dst_a,
+                                                      blend->rgb_src_factor,
+                                                      alpha_chan);
+        nir_ssa_def *dst_factor = vc4_blend_channel_i(b,
+                                                      src_color, dst_color,
+                                                      src_a, dst_a,
+                                                      blend->rgb_dst_factor,
+                                                      alpha_chan);
+
+        if (alpha_chan != 4 &&
+            blend->alpha_src_factor != blend->rgb_src_factor) {
+                nir_ssa_def *src_alpha_factor =
+                        vc4_blend_channel_i(b,
+                                            src_color, dst_color,
+                                            src_a, dst_a,
+                                            blend->alpha_src_factor,
+                                            alpha_chan);
+                src_factor = vc4_nir_set_packed_chan(b, src_factor,
+                                                     src_alpha_factor,
+                                                     alpha_chan);
+        }
+        if (alpha_chan != 4 &&
+            blend->alpha_dst_factor != blend->rgb_dst_factor) {
+                nir_ssa_def *dst_alpha_factor =
+                        vc4_blend_channel_i(b,
+                                            src_color, dst_color,
+                                            src_a, dst_a,
+                                            blend->alpha_dst_factor,
+                                            alpha_chan);
+                dst_factor = vc4_nir_set_packed_chan(b, dst_factor,
+                                                     dst_alpha_factor,
+                                                     alpha_chan);
+        }
+        nir_ssa_def *src_blend = nir_umul_unorm_4x8(b, src_color, src_factor);
+        nir_ssa_def *dst_blend = nir_umul_unorm_4x8(b, dst_color, dst_factor);
+
+        nir_ssa_def *result =
+                vc4_blend_func_i(b, src_blend, dst_blend, blend->rgb_func);
+        if (alpha_chan != 4 && blend->alpha_func != blend->rgb_func) {
+                nir_ssa_def *result_a = vc4_blend_func_i(b,
+                                                         src_blend,
+                                                         dst_blend,
+                                                         blend->alpha_func);
+                result = vc4_nir_set_packed_chan(b, result, result_a,
+                                                 alpha_chan);
         }
+        return result;
 }
 
 static nir_ssa_def *
@@ -299,12 +476,33 @@ vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b,
         nir_builder_instr_insert(b, &discard->instr);
 }
 
+static nir_ssa_def *
+vc4_nir_swizzle_and_pack(struct vc4_compile *c, nir_builder *b,
+                         nir_ssa_def **colors)
+{
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+
+        nir_ssa_def *swizzled[4];
+        for (int i = 0; i < 4; i++) {
+                swizzled[i] = vc4_nir_get_swizzled_channel(b, colors,
+                                                           format_swiz[i]);
+        }
+
+        return nir_pack_unorm_4x8(b,
+                                  nir_vec4(b,
+                                           swizzled[0], swizzled[1],
+                                           swizzled[2], swizzled[3]));
+
+}
+
 static void
 vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                           nir_intrinsic_instr *intr)
 {
         enum pipe_format color_format = c->fs_key->color_format;
         const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+        bool srgb = util_format_is_srgb(color_format);
 
         /* Pull out the float src/dst color components. */
         nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
@@ -315,45 +513,39 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                 unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
         }
 
-        /* Unswizzle the destination color. */
-        nir_ssa_def *dst_color[4];
-        for (unsigned i = 0; i < 4; i++) {
-                dst_color[i] = vc4_nir_get_swizzled_channel(b,
-                                                            unpacked_dst_color,
-                                                            format_swiz[i]);
-        }
-
         vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
 
-        /* Turn dst color to linear. */
-        if (util_format_is_srgb(color_format)) {
+        nir_ssa_def *packed_color;
+        if (srgb) {
+                /* Unswizzle the destination color. */
+                nir_ssa_def *dst_color[4];
+                for (unsigned i = 0; i < 4; i++) {
+                        dst_color[i] = vc4_nir_get_swizzled_channel(b,
+                                                                    unpacked_dst_color,
+                                                                    format_swiz[i]);
+                }
+
+                /* Turn dst color to linear. */
                 for (int i = 0; i < 3; i++)
                         dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]);
-        }
 
-        nir_ssa_def *blend_color[4];
-        vc4_do_blending(c, b, blend_color, src_color, dst_color);
+                nir_ssa_def *blend_color[4];
+                vc4_do_blending_f(c, b, blend_color, src_color, dst_color);
 
-        /* sRGB encode the output color */
-        if (util_format_is_srgb(color_format)) {
+                /* sRGB encode the output color */
                 for (int i = 0; i < 3; i++)
                         blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]);
-        }
 
-        nir_ssa_def *swizzled_outputs[4];
-        for (int i = 0; i < 4; i++) {
-                swizzled_outputs[i] =
-                        vc4_nir_get_swizzled_channel(b, blend_color,
-                                                     format_swiz[i]);
-        }
+                packed_color = vc4_nir_swizzle_and_pack(c, b, blend_color);
+        } else {
+                nir_ssa_def *packed_src_color =
+                        vc4_nir_swizzle_and_pack(c, b, src_color);
 
-        nir_ssa_def *packed_color =
-                nir_pack_unorm_4x8(b,
-                                   nir_vec4(b,
-                                            swizzled_outputs[0],
-                                            swizzled_outputs[1],
-                                            swizzled_outputs[2],
-                                            swizzled_outputs[3]));
+                packed_color =
+                        vc4_do_blending_i(c, b,
+                                          packed_src_color, packed_dst_color,
+                                          src_color[3]);
+        }
 
         packed_color = vc4_logicop(b, c->fs_key->logicop_func,
                                    packed_color, packed_dst_color);
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index fce24cd2330..fa1b50f3d10 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -253,6 +253,8 @@ enum quniform_contents {
         QUNIFORM_BLEND_CONST_COLOR_Y,
         QUNIFORM_BLEND_CONST_COLOR_Z,
         QUNIFORM_BLEND_CONST_COLOR_W,
+        QUNIFORM_BLEND_CONST_COLOR_RGBA,
+        QUNIFORM_BLEND_CONST_COLOR_AAAA,
 
         QUNIFORM_STENCIL,
 
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 8a759c2ca4c..147694644f0 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -51,7 +51,9 @@ vc4_set_blend_color(struct pipe_context *pctx,
                     const struct pipe_blend_color *blend_color)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
-        vc4->blend_color = *blend_color;
+        vc4->blend_color.f = *blend_color;
+        for (int i = 0; i < 4; i++)
+                vc4->blend_color.ub[i] = float_to_ubyte(blend_color->color[i]);
         vc4->dirty |= VC4_DIRTY_BLEND_COLOR;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 85d6998205e..f5ad481f186 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -262,11 +262,35 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                 case QUNIFORM_BLEND_CONST_COLOR_Z:
                 case QUNIFORM_BLEND_CONST_COLOR_W:
                         cl_aligned_f(&uniforms,
-                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
-                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
+                                     CLAMP(vc4->blend_color.f.color[uinfo->contents[i] -
+                                                                    QUNIFORM_BLEND_CONST_COLOR_X],
                                            0, 1));
                         break;
 
+                case QUNIFORM_BLEND_CONST_COLOR_RGBA: {
+                        const uint8_t *format_swiz =
+                                vc4_get_format_swizzle(vc4->framebuffer.cbufs[0]->format);
+                        uint32_t color = 0;
+                        for (int i = 0; i < 4; i++) {
+                                if (format_swiz[i] >= 4)
+                                        continue;
+
+                                color |= (vc4->blend_color.ub[format_swiz[i]] <<
+                                          (i * 8));
+                        }
+                        cl_aligned_u32(&uniforms, color);
+                        break;
+                }
+
+                case QUNIFORM_BLEND_CONST_COLOR_AAAA: {
+                        uint8_t a = vc4->blend_color.ub[3];
+                        cl_aligned_u32(&uniforms, ((a) |
+                                                   (a << 8) |
+                                                   (a << 16) |
+                                                   (a << 24)));
+                        break;
+                }
+
                 case QUNIFORM_STENCIL:
                         cl_aligned_u32(&uniforms,
                                        vc4->zsa->stencil_uniforms[uinfo->data[i]] |
@@ -330,6 +354,8 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                 case QUNIFORM_BLEND_CONST_COLOR_Y:
                 case QUNIFORM_BLEND_CONST_COLOR_Z:
                 case QUNIFORM_BLEND_CONST_COLOR_W:
+                case QUNIFORM_BLEND_CONST_COLOR_RGBA:
+                case QUNIFORM_BLEND_CONST_COLOR_AAAA:
                         dirty |= VC4_DIRTY_BLEND_COLOR;
                         break;
 
-- 
2.30.2