From 49ec647c3b724c53f4be45ed97f8d8b1046e7ccd Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 6 Jan 2016 01:05:35 +0100
Subject: [PATCH] llvmpipe: avoid most 64 bit math in rasterization

The trick here is to recognize that in the c + n * dcdx calculations,
not only can the lower FIXED_ORDER bits not change (as the dcdx values
have those all zero) but that this means the sign bit of the calculations
cannot be different as well, that is
sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER)).
That shaves off more than enough bits to never require 64bit masks.
A shifted plane c value could still easily exceed 32 bits, however since we
throw out planes which are trivial accept even before binning (and similarly
don't even get to see tris for which there was a trivial reject plane)) this
is never a problem.
The idea isnt't all that revolutionary, in fact something similar was tried
ages ago (9773722c2b09d5f0615a47cecf4347859474dc56) back when the values were
only 32 bit anyway. I believe now it didn't quite work then because the
adjustment needed for testing trivial reject / partial masks wasn't handled
correctly.
This still keeps the separate 32/64 bit paths for now, as the 32 bit one still
looks minimally simpler (and also because if we'd pass in dcdx/dcdy/eo unscaled
from setup which would be a good reason to ditch the 32 bit path, we'd need to
change the special-purpose rasterization functions for small tris).

This passes piglit triangle-rasterization (-fbo -auto -max_size
-subpixelbits 8) and triangle-rasterization-overdraw (with some hacks
to make it work correctly with large sizes) easily (full piglit as
well of course, but most tests wouldn't use triangles large enough to
be affected, that is tris with a bounding box over 128x128).
The profiler says indeed time spent in rast_tri functions is reduced
substantially, BUT of course only if the tris are large. I measured a 3%
improvement in mesa gloss demo when supersized to twice the screen size...

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c    | 101 +++++++++--------
 .../drivers/llvmpipe/lp_rast_tri_tmp.h        | 107 +++++++++++++++---
 2 files changed, 143 insertions(+), 65 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 232c8599e42..0ae6ec28d35 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -64,43 +64,43 @@ block_full_16(struct lp_rasterizer_task *task,
 }
 
 static inline unsigned
-build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
+build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
 {
    unsigned mask = 0;
 
-   int64_t c0 = c;
-   int64_t c1 = c0 + dcdy;
-   int64_t c2 = c1 + dcdy;
-   int64_t c3 = c2 + dcdy;
-
-   mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
-   mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
-   mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
-   mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
-   mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
-   mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
-   mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
-   mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
-   mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
-   mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
-   mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
-   mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
-   mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
-   mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
-   mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
-   mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);
+   int32_t c0 = c;
+   int32_t c1 = c0 + dcdy;
+   int32_t c2 = c1 + dcdy;
+   int32_t c3 = c2 + dcdy;
+
+   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
+   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
+   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
+   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
+   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
+   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
+   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
+   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
+   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
+   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
+   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
+   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
+   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
+   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
+   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
+   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
   
    return mask;
 }
 
 
 static inline void
-build_masks(int64_t c,
-            int64_t cdiff,
-            int64_t dcdx,
-            int64_t dcdy,
-	    unsigned *outmask,
-	    unsigned *partmask)
+build_masks(int32_t c,
+            int32_t cdiff,
+            int32_t dcdx,
+            int32_t dcdy,
+            unsigned *outmask,
+            unsigned *partmask)
 {
    *outmask |= build_mask_linear(c, dcdx, dcdy);
    *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
@@ -140,12 +140,12 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
 
 
 static inline void
-build_masks_32(int c, 
-               int cdiff,
-               int dcdx,
-               int dcdy,
-               unsigned *outmask,
-               unsigned *partmask)
+build_masks_sse(int c,
+                int cdiff,
+                int dcdx,
+                int dcdy,
+                unsigned *outmask,
+                unsigned *partmask)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -186,7 +186,7 @@ build_masks_32(int c,
 
 
 static inline unsigned
-build_mask_linear_32(int c, int dcdx, int dcdy)
+build_mask_linear_sse(int c, int dcdx, int dcdy)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = _mm_set1_epi32(dcdy);
@@ -442,12 +442,12 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #include "util/u_pwr8.h"
 
 static inline void
-build_masks_32(int c,
-               int cdiff,
-               int dcdx,
-               int dcdy,
-               unsigned *outmask,
-               unsigned *partmask)
+build_masks_ppc(int c,
+                int cdiff,
+                int dcdx,
+                int dcdy,
+                unsigned *outmask,
+                unsigned *partmask)
 {
    __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = (__m128i) vec_splats(dcdy);
@@ -487,7 +487,7 @@ build_masks_32(int c,
 }
 
 static inline unsigned
-build_mask_linear_32(int c, int dcdx, int dcdy)
+build_mask_linear_ppc(int c, int dcdx, int dcdy)
 {
    __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
    __m128i xdcdy = (__m128i) vec_splats(dcdy);
@@ -684,8 +684,18 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #endif
 
 
+#if defined PIPE_ARCH_SSE
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
+#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
+#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
+#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
+#else
 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
 #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
+#endif
+
+#define RASTER_64 1
 
 #define TAG(x) x##_1
 #define NR_PLANES 1
@@ -722,12 +732,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
-#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
-#undef BUILD_MASKS
-#undef BUILD_MASK_LINEAR
-#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
-#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
-#endif
+#undef RASTER_64
 
 #define TAG(x) x##_32_1
 #define NR_PLANES 1
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index e0aea94205e..05d2242786f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -50,9 +50,15 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
    int j;
 
    for (j = 0; j < NR_PLANES; j++) {
-      mask &= ~BUILD_MASK_LINEAR(c[j] - 1, 
-				 -plane[j].dcdx,
-				 plane[j].dcdy);
+#ifdef RASTER_64
+      mask &= ~BUILD_MASK_LINEAR(((c[j] - 1) >> (int64_t)FIXED_ORDER),
+                                 -plane[j].dcdx >> FIXED_ORDER,
+                                 plane[j].dcdy >> FIXED_ORDER);
+#else
+      mask &= ~BUILD_MASK_LINEAR((c[j] - 1),
+                                 -plane[j].dcdx,
+                                 plane[j].dcdy);
+#endif
    }
 
    /* Now pass to the shader:
@@ -79,17 +85,33 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
    partmask = 0;                /* outside one or more trivial accept planes */
 
    for (j = 0; j < NR_PLANES; j++) {
+#ifdef RASTER_64
+      int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
+      int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
+      const int32_t cox = plane[j].eo >> FIXED_ORDER;
+      const int32_t ei = (dcdy + dcdx - cox) << 2;
+      const int32_t cox_s = cox << 2;
+      const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
+      int32_t cdiff;
+      cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
+                            (int32_t)(c[j] >> (int64_t)FIXED_ORDER));
+      dcdx <<= 2;
+      dcdy <<= 2;
+#else
       const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
       const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
       const int64_t cox = IMUL64(plane[j].eo, 4);
-      const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
+      const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
       const int64_t cio = IMUL64(ei, 4) - 1;
+      int32_t co, cdiff;
+      co = c[j] + cox;
+      cdiff = cio - cox;
+#endif
 
-      BUILD_MASKS(c[j] + cox,
-		  cio - cox,
-		  dcdx, dcdy, 
-		  &outmask,   /* sign bits from c[i][0..15] + cox */
-		  &partmask); /* sign bits from c[i][0..15] + cio */
+      BUILD_MASKS(co, cdiff,
+                  dcdx, dcdy,
+                  &outmask,   /* sign bits from c[i][0..15] + cox */
+                  &partmask); /* sign bits from c[i][0..15] + cio */
    }
 
    if (outmask == 0xffff)
@@ -179,14 +201,65 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x);
 
       {
-         const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
-         const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
-         const int64_t cox = IMUL64(plane[j].eo, 16);
-         const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
-         const int64_t cio = IMUL64(ei, 16) - 1;
-
-         BUILD_MASKS(c[j] + cox,
-                     cio - cox,
+#ifdef RASTER_64
+         /*
+          * Strip off lower FIXED_ORDER bits. Note that those bits from
+          * dcdx, dcdy, eo are always 0 (by definition).
+          * c values, however, are not. This means that for every
+          * addition of the form c + n*dcdx the lower FIXED_ORDER bits will
+          * NOT change. And those bits are not relevant to the sign bit (which
+          * is only what we need!) that is,
+          * sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER))
+          * This means we can get away with using 32bit math for the most part.
+          * Only tricky part is the -1 adjustment for cdiff.
+          */
+         int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
+         int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
+         const int32_t cox = plane[j].eo >> FIXED_ORDER;
+         const int32_t ei = (dcdy + dcdx - cox) << 4;
+         const int32_t cox_s = cox << 4;
+         const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
+         int32_t cdiff;
+         /*
+          * Plausibility check to ensure the 32bit math works.
+          * Note that within a tile, the max we can move the edge function
+          * is essentially dcdx * TILE_SIZE + dcdy * TILE_SIZE.
+          * TILE_SIZE is 64, dcdx/dcdy are nominally 21 bit (for 8192 max size
+          * and 8 subpixel bits), I'd be happy with 2 bits more too (1 for
+          * increasing fb size to 16384, the required d3d11 value, another one
+          * because I'm not quite sure we can't be _just_ above the max value
+          * here). This gives us 30 bits max - hence if c would exceed that here
+          * that means the plane is either trivial reject for the whole tile
+          * (in which case the tri will not get binned), or trivial accept for
+          * the whole tile (in which case plane_mask will not include it).
+          */
+         assert((c[j] >> (int64_t)FIXED_ORDER) > (int32_t)0xb0000000 &&
+                (c[j] >> (int64_t)FIXED_ORDER) < (int32_t)0x3fffffff);
+         /*
+          * Note the fixup part is constant throughout the tile - thus could
+          * just calculate this and avoid _all_ 64bit math in rasterization
+          * (except exactly this fixup calc).
+          * In fact theoretically could move that even to setup, albeit that
+          * seems tricky (pre-bin certainly can have values larger than 32bit,
+          * and would need to communicate that fixup value through).
+          * And if we want to support msaa, we'd probably don't want to do the
+          * downscaling in setup in any case...
+          */
+         cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
+                               (int32_t)(c[j] >> (int64_t)FIXED_ORDER));
+         dcdx <<= 4;
+         dcdy <<= 4;
+#else
+         const int32_t dcdx = -plane[j].dcdx << 4;
+         const int32_t dcdy = plane[j].dcdy << 4;
+         const int32_t cox = plane[j].eo << 4;
+         const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int32_t)plane[j].eo;
+         const int32_t cio = (ei << 4) - 1;
+         int32_t co, cdiff;
+         co = c[j] + cox;
+         cdiff = cio - cox;
+#endif
+         BUILD_MASKS(co, cdiff,
                      dcdx, dcdy,
                      &outmask,   /* sign bits from c[i][0..15] + cox */
                      &partmask); /* sign bits from c[i][0..15] + cio */
-- 
2.30.2