From b5957cee920cd7a62e4e726538dbbe44c12e33ab Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Sat, 18 Nov 2017 06:23:35 +0100
Subject: [PATCH] llvmpipe: fix snorm blending

The blend math gets a bit funky due to inverse blend factors being in
range [0,2] rather than [-1,1], so our normalized math can't really
cover this. The src_alpha_saturate blend factor has a similar problem.
(Note that the piglit fbo-blending-formats test is mostly useless for
anything but unorm formats, since not only are all src/dst values
between [0,1], but the tests are also crafted in a way that the results
end up between [0,1].)

v2: some formatting fixes, and fix a fairly obscure (to debug) issue
with alpha-only formats (not related to snorm at all), where the blend
optimization would assume it could simplify the blend equation if the
blend factors were complementary, but was using the completely
unrelated rgb blend factors instead of the alpha ones.

Reviewed-by: Jose Fonseca
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c |  50 +++----
 src/gallium/auxiliary/gallivm/lp_bld_arit.h |   7 +
 src/gallium/drivers/llvmpipe/lp_bld_blend.c | 134 ++++++++++++++++--
 .../drivers/llvmpipe/lp_bld_blend_aos.c     |  53 ++++---
 4 files changed, 191 insertions(+), 53 deletions(-)
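Illustrative aside, not part of the diff that follows: the range problem the
commit message describes can be reproduced with a tiny standalone scalar
model of one GL snorm8 channel (the helper name below is made up for this
note, it is not llvmpipe code):

/* Standalone illustration only -- not part of the patch. */
#include <stdio.h>

/* GL-style snorm8 -> float: value = i/127.0, with -128 clamped to -1.0 */
static float snorm8_to_float(signed char i)
{
   float f = i / 127.0f;
   return f < -1.0f ? -1.0f : f;
}

int main(void)
{
   float dst = snorm8_to_float(-127);   /* dst channel = -1.0 */
   float inv = 1.0f - dst;              /* ONE_MINUS_DST_COLOR factor */

   printf("dst = %.3f, 1 - dst = %.3f\n", dst, inv);
   /* 2.0 lies outside [-1,1]; it has no snorm8 representation, so the
    * term dst_factor = 1 - dst cannot be formed in the narrow snorm type. */
   return 0;
}

With dst = -1.0 the ONE_MINUS_DST_COLOR factor evaluates to 2.0, which the
signed normalized type cannot hold; this is why the patch below widens the
values before multiplying and blending in lp_build_blend().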
"llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; } } @@ -842,38 +842,38 @@ lp_build_sub(struct lp_build_context *bld, assert(lp_check_value(type, a)); assert(lp_check_value(type, b)); - if(b == bld->zero) + if (b == bld->zero) return a; - if(a == bld->undef || b == bld->undef) + if (a == bld->undef || b == bld->undef) return bld->undef; - if(a == b) + if (a == b) return bld->zero; - if(bld->type.norm) { + if (type.norm) { const char *intrinsic = NULL; - if(b == bld->one) + if (!type.sign && b == bld->one) return bld->zero; if (!type.floating && !type.fixed) { if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { - if(type.width == 8) + if (type.width == 8) intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; } else if (util_cpu_caps.has_altivec) { - if(type.width == 8) + if (type.width == 8) intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; } } if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { - if(type.width == 8) + if (type.width == 8) intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w"; } } @@ -963,7 +963,7 @@ lp_build_sub(struct lp_build_context *bld, * @sa Michael Herf, The "double blend trick", May 2000, * http://www.stereopsis.com/doubleblend.html */ -static LLVMValueRef +LLVMValueRef lp_build_mul_norm(struct gallivm_state *gallivm, struct lp_type wide_type, LLVMValueRef a, LLVMValueRef b) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 2a4137a6780..f5b2800a2cf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -71,6 +71,13 @@ lp_build_sub(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_mul_norm(struct gallivm_state *gallivm, + struct lp_type wide_type, + LLVMValueRef a, + LLVMValueRef b); + LLVMValueRef lp_build_mul(struct lp_build_context *bld, LLVMValueRef a, diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c index 1feb415c9e5..02ec55eddbd 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c @@ -35,6 +35,7 @@ #include "gallivm/lp_bld_swizzle.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_pack.h" #include "lp_bld_blend.h" @@ -65,11 +66,11 @@ lp_build_blend_func_commutative(unsigned func) boolean lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func) { - if(rgb_func == alpha_func) + if (rgb_func == alpha_func) return FALSE; - if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT) + if (rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT) return TRUE; - if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT) + if (rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT) return TRUE; return FALSE; } @@ -81,10 +82,64 @@ lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func) static inline boolean lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor) { + 
+   STATIC_ASSERT((PIPE_BLENDFACTOR_ZERO ^ 0x10) == PIPE_BLENDFACTOR_ONE);
+   STATIC_ASSERT((PIPE_BLENDFACTOR_CONST_COLOR ^ 0x10) ==
+                 PIPE_BLENDFACTOR_INV_CONST_COLOR);
    return dst_factor == (src_factor ^ 0x10);
 }
 
 
+/**
+ * Whether this is a inverse blend factor
+ */
+static inline boolean
+is_inverse_factor(unsigned factor)
+{
+   STATIC_ASSERT(PIPE_BLENDFACTOR_ZERO == 0x11);
+   return factor > 0x11;
+}
+
+
+/**
+ * Calculates the (expanded to wider type) multiplication
+ * of 2 normalized numbers.
+ */
+static void
+lp_build_mul_norm_expand(struct lp_build_context *bld,
+                         LLVMValueRef a, LLVMValueRef b,
+                         LLVMValueRef *resl, LLVMValueRef *resh,
+                         boolean signedness_differs)
+{
+   const struct lp_type type = bld->type;
+   struct lp_type wide_type = lp_wider_type(type);
+   struct lp_type wide_type2 = wide_type;
+   struct lp_type type2 = type;
+   LLVMValueRef al, ah, bl, bh;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+   assert(!type.floating && !type.fixed && type.norm);
+
+   if (a == bld->zero || b == bld->zero) {
+      LLVMValueRef zero = LLVMConstNull(lp_build_vec_type(bld->gallivm, wide_type));
+      *resl = zero;
+      *resh = zero;
+      return;
+   }
+
+   if (signedness_differs) {
+      type2.sign = !type.sign;
+      wide_type2.sign = !wide_type2.sign;
+   }
+
+   lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+   lp_build_unpack2_native(bld->gallivm, type2, wide_type2, b, &bl, &bh);
+
+   *resl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
+   *resh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
+}
+
+
 /**
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
  */
@@ -155,7 +210,7 @@ lp_build_blend(struct lp_build_context *bld,
       } else {
          return lp_build_lerp(bld, dst_factor, src, dst, 0);
       }
-   } else if(bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
+   } else if (bld->type.floating && func == PIPE_BLEND_SUBTRACT) {
       result = lp_build_add(bld, src, dst);
 
       if (factor_src < factor_dst) {
@@ -165,7 +220,7 @@ lp_build_blend(struct lp_build_context *bld,
          result = lp_build_mul(bld, result, dst_factor);
          return lp_build_sub(bld, src, result);
       }
-   } else if(bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
+   } else if (bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) {
       result = lp_build_add(bld, src, dst);
 
       if (factor_src < factor_dst) {
@@ -192,9 +247,72 @@ lp_build_blend(struct lp_build_context *bld,
    if (optimise_only)
       return NULL;
 
-   src_term = lp_build_mul(bld, src, src_factor);
-   dst_term = lp_build_mul(bld, dst, dst_factor);
-   return lp_build_blend_func(bld, func, src_term, dst_term);
+   if ((bld->type.norm && bld->type.sign) &&
+       (is_inverse_factor(factor_src) || is_inverse_factor(factor_dst))) {
+      /*
+       * With snorm blending, the inverse blend factors range from [0,2]
+       * instead of [-1,1], so the ordinary signed normalized arithmetic
+       * doesn't quite work. Unpack must be unsigned, and the add/sub
+       * must be done with wider type.
+       * (Note that it's not quite obvious what the blend equation wrt to
+       * clamping should actually be based on GL spec in this case, but
+       * really the incoming src values are clamped to [-1,1] (the dst is
+       * always clamped already), and then NO further clamping occurs until
+       * the end.)
+       */
+      struct lp_build_context bldw;
+      struct lp_type wide_type = lp_wider_type(bld->type);
+      LLVMValueRef src_terml, src_termh, dst_terml, dst_termh;
+      LLVMValueRef resl, resh;
+
+      /*
+       * We don't need saturate math for the sub/add, since we have
+       * x+1 bit numbers in x*2 wide type (result is x+2 bits).
+       * (Doesn't really matter on x86 sse2 though as we use saturated
+       * intrinsics.)
+       */
+      wide_type.norm = 0;
+      lp_build_context_init(&bldw, bld->gallivm, wide_type);
+
+      /*
+       * XXX This is a bit hackish. Note that -128 really should
+       * be -1.0, the same as -127. However, we did not actually clamp
+       * things anywhere (relying on pack intrinsics instead) therefore
+       * we will get -128, and the inverted factor then 255. But the mul
+       * can overflow in this case (rather the rounding fixups for the mul,
+       * -128*255 will be positive).
+       * So we clamp the src and dst up here but only when necessary (we
+       * should do this before calculating blend factors but it's enough
+       * for avoiding overflow).
+       */
+      if (is_inverse_factor(factor_src)) {
+         src = lp_build_max(bld, src,
+                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
+      }
+      if (is_inverse_factor(factor_dst)) {
+         dst = lp_build_max(bld, dst,
+                            lp_build_const_vec(bld->gallivm, bld->type, -1.0));
+      }
+
+      lp_build_mul_norm_expand(bld, src, src_factor, &src_terml, &src_termh,
+                               is_inverse_factor(factor_src) ? TRUE : FALSE);
+      lp_build_mul_norm_expand(bld, dst, dst_factor, &dst_terml, &dst_termh,
+                               is_inverse_factor(factor_dst) ? TRUE : FALSE);
+      resl = lp_build_blend_func(&bldw, func, src_terml, dst_terml);
+      resh = lp_build_blend_func(&bldw, func, src_termh, dst_termh);
+
+      /*
+       * XXX pack2_native is not ok because the values have to be in dst
+       * range. We need native pack though for the correct order on avx2.
+       * Will break on everything not implementing clamping pack intrinsics
+       * (i.e. everything but sse2 and altivec).
+       */
+      return lp_build_pack2_native(bld->gallivm, wide_type, bld->type, resl, resh);
+   } else {
+      src_term = lp_build_mul(bld, src, src_factor);
+      dst_term = lp_build_mul(bld, dst, dst_factor);
+      return lp_build_blend_func(bld, func, src_term, dst_term);
+   }
 }
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index 45c5c2bb65e..c16ef1a2e91 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -112,22 +112,34 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
    case PIPE_BLENDFACTOR_DST_ALPHA:
       return bld->dst;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      if(alpha)
+      if (alpha)
         return bld->base.one;
      else {
         /*
          * If there's no dst alpha the complement is zero but for unclamped
-         * float inputs min can be non-zero (negative).
+         * float inputs (or snorm inputs) min can be non-zero (negative).
          */
-        if (!bld->has_dst_alpha) {
-           if (!bld->saturate)
+        if (!bld->saturate) {
+           if (!bld->has_dst_alpha) {
              bld->saturate = lp_build_min(&bld->base, src_alpha, bld->base.zero);
-        }
-        else {
-           if(!bld->inv_dst)
-              bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
-           if(!bld->saturate)
+           }
+           else if (bld->base.type.norm && bld->base.type.sign) {
+              /*
+               * The complement/min totally doesn't work, since
+               * the complement is in range [0,2] but the other
+               * min input is [-1,1]. However, we can just clamp to 0
+               * before doing the complement...
+               */
+              LLVMValueRef inv_dst;
+              inv_dst = lp_build_max(&bld->base, bld->base.zero, bld->dst);
+              inv_dst = lp_build_comp(&bld->base, inv_dst);
+              bld->saturate = lp_build_min(&bld->base, src_alpha, inv_dst);
+           } else {
+              if (!bld->inv_dst) {
+                 bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
+              }
              bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst);
+           }
        }
        return bld->saturate;
      }
@@ -140,24 +152,24 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
       return src1_alpha;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      if(!bld->inv_src)
+      if (!bld->inv_src)
         bld->inv_src = lp_build_comp(&bld->base, bld->src);
      return bld->inv_src;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      if(!bld->inv_src_alpha)
+      if (!bld->inv_src_alpha)
         bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha);
      return bld->inv_src_alpha;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      if(!bld->inv_dst)
+      if (!bld->inv_dst)
         bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
      return bld->inv_dst;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      if(!bld->inv_const)
+      if (!bld->inv_const)
         bld->inv_const = lp_build_comp(&bld->base, bld->const_);
      return bld->inv_const;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      if(!bld->inv_const_alpha)
+      if (!bld->inv_const_alpha)
        bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
      return bld->inv_const_alpha;
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
@@ -331,7 +343,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    bld.const_alpha = const_alpha;
    bld.has_dst_alpha = FALSE;
 
-   /* Find the alpha channel if not provided seperately */
+   /* Find the alpha channel if not provided separately */
    if (!src_alpha) {
       for (i = 0; i < 4; ++i) {
         if (swizzle[i] == 3) {
@@ -349,7 +361,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    }
 
    if (blend->logicop_enable) {
-      if(!type.floating) {
+      if (!type.floating) {
         result = lp_build_logicop(gallivm->builder, blend->logicop_func, src, dst);
      } else {
@@ -361,6 +373,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
       boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor &&
                                 state->alpha_src_factor == state->alpha_dst_factor) ||
                                nr_channels == 1;
+      boolean alpha_only = nr_channels == 1 && alpha_swizzle == PIPE_SWIZZLE_X;
 
       src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
                                          state->alpha_src_factor,
@@ -374,8 +387,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
 
       result = lp_build_blend(&bld.base,
                               state->rgb_func,
-                              state->rgb_src_factor,
-                              state->rgb_dst_factor,
+                              alpha_only ? state->alpha_src_factor : state->rgb_src_factor,
+                              alpha_only ? state->alpha_dst_factor : state->rgb_dst_factor,
                               src,
                               dst,
                               src_factor,
@@ -383,8 +396,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
                               rgb_alpha_same,
                               false);
 
-      if(state->rgb_func != state->alpha_func && nr_channels > 1 &&
-         alpha_swizzle != PIPE_SWIZZLE_NONE) {
+      if (state->rgb_func != state->alpha_func && nr_channels > 1 &&
+          alpha_swizzle != PIPE_SWIZZLE_NONE) {
        LLVMValueRef alpha;
 
        alpha = lp_build_blend(&bld.base,
-- 
2.30.2
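Illustrative footnote, not part of the patch: the new snorm path in
lp_build_blend() above is easiest to follow as plain scalar arithmetic. The
sketch below models one 8-bit snorm channel for src*ONE +
dst*ONE_MINUS_SRC_COLOR with the ADD func and mirrors the steps the generated
code takes: clamp -128 up to -127, treat the inverse factor as unsigned
(range [0,254], i.e. [0.0,2.0]), multiply and add in a 16-bit "wide" type,
and clamp back to the snorm range only when packing. The function names here
are made up for the illustration; the real code works on LLVM vectors.

/* Illustrative only -- a scalar model of the widened snorm blend path. */
#include <stdint.h>
#include <stdio.h>

/* Multiply two 8-bit normalized values held in 16-bit variables:
 * the result represents (a/127) * (b/127), kept in the wide type. */
static int16_t mul_norm8_wide(int16_t a, int16_t b)
{
   int32_t p = (int32_t)a * b;
   /* round to nearest when dividing by 127 */
   return (int16_t)((p + (p >= 0 ? 63 : -63)) / 127);
}

/* One channel of src*ONE + dst*ONE_MINUS_SRC_COLOR with ADD, snorm8. */
static int8_t blend_channel_snorm8(int8_t src, int8_t dst)
{
   /* -128 has no distinct meaning (it is also -1.0); clamp it away so the
    * inverse factor below stays <= 254, i.e. <= 2.0. */
   int16_t s = src < -127 ? -127 : src;
   int16_t d = dst < -127 ? -127 : dst;

   /* Inverse factor 1 - src: unsigned, range [0,254] ~ [0.0,2.0]. */
   int16_t inv_src = 127 - s;

   /* Multiply and add in the wide (16-bit) type; no clamping yet. */
   int16_t src_term = mul_norm8_wide(s, 127);       /* src * 1.0       */
   int16_t dst_term = mul_norm8_wide(d, inv_src);   /* dst * (1 - src) */
   int16_t sum = src_term + dst_term;

   /* Clamp back to the snorm8 range only at the very end (the pack step). */
   if (sum > 127)  sum = 127;
   if (sum < -127) sum = -127;
   return (int8_t)sum;
}

int main(void)
{
   /* src = -1.0, dst = -1.0: the result is -1 + (-1)*2 = -3.0 -> -1.0 */
   printf("%d\n", blend_channel_snorm8(-127, -127));
   return 0;
}

Run with src = dst = -1.0 this prints -127: the unclamped intermediate is
-3.0 and only the final pack clamps it back to -1.0, which matches the
behaviour described in the comments added to lp_build_blend().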