llvmpipe: combine linear mask calculation
author Keith Whitwell <keithw@vmware.com>
Sun, 22 Aug 2010 21:56:54 +0000 (22:56 +0100)
committer Keith Whitwell <keithw@vmware.com>
Tue, 31 Aug 2010 22:31:45 +0000 (23:31 +0100)
src/gallium/drivers/llvmpipe/lp_rast_tri.c
src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h

index 5b3ad6e0a784412aa8c3fadd8425bfd15f9adcf7..bdb8d131ccddebcc7fc3611625aa8ed4f1f64e85 100644 (file)
@@ -128,11 +128,71 @@ build_mask_linear(int c, int dcdx, int dcdy)
   
    return mask;
 }
+
+
+static INLINE void /* generic path: produces both masks by calling build_mask_linear() twice */
+build_masks(int c, /* base value handed to the first build_mask_linear() call */
+           int cdiff, /* added to c to form the base value of the second call */
+           int dcdx, /* forwarded unchanged to both calls */
+           int dcdy, /* forwarded unchanged to both calls */
+           unsigned *outmask, /* in/out: the mask built from c is OR-ed in */
+           unsigned *partmask) /* in/out: the mask built from c + cdiff is OR-ed in */
+{
+   *outmask |= build_mask_linear(c, dcdx, dcdy); /* accumulates: bits already set by the caller are kept */
+   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); /* same steps, base shifted by cdiff */
+}
+
 #else
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
 
+static INLINE void /* SSE2 path: computes both 16-bit sign masks from one shared 4x4 grid of values */
+build_masks(int c, /* value at grid position (row 0, column 0) */
+           int cdiff, /* added to every grid value before the second mask is taken */
+           int dcdx, /* increment between adjacent columns */
+           int dcdy, /* increment between adjacent rows */
+           unsigned *outmask, /* in/out: sign bits of the grid values are OR-ed in */
+           unsigned *partmask) /* in/out: sign bits of (grid values + cdiff) are OR-ed in */
+{
+   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); /* row 0; setr puts column 0 in the lowest lane */
+   __m128i xdcdy = _mm_set1_epi32(dcdy); /* dcdy replicated into all four lanes */
+
+   /* Build rows 1..3 of the 4x4 grid: each row is the previous row plus dcdy.
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+   { /* first mask: the grid values exactly as computed above */
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = _mm_packs_epi32(cstep0, cstep1); /* 32 -> 16 bit with signed saturation, so each sign is preserved */
+      cstep23 = _mm_packs_epi32(cstep2, cstep3);
+      result = _mm_packs_epi16(cstep01, cstep23); /* 16 -> 8 bit, again saturating; 16 bytes in row-major order */
+
+      *outmask |= _mm_movemask_epi8(result); /* gathers each byte's top bit: bit (4*row + col) is set where the value is negative */
+   }
+
+
+   { /* second mask: the same grid with cdiff added to every value */
+      __m128i cio4 = _mm_set1_epi32(cdiff); /* cdiff replicated into all four lanes */
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = _mm_add_epi32(cstep0, cio4); /* rows are overwritten in place; the first mask has already been stored */
+      cstep1 = _mm_add_epi32(cstep1, cio4);
+      cstep2 = _mm_add_epi32(cstep2, cio4);
+      cstep3 = _mm_add_epi32(cstep3, cio4);
+
+      cstep01 = _mm_packs_epi32(cstep0, cstep1); /* same sign-preserving narrowing as for the first mask */
+      cstep23 = _mm_packs_epi32(cstep2, cstep3);
+      result = _mm_packs_epi16(cstep01, cstep23);
+
+      *partmask |= _mm_movemask_epi8(result); /* same bit layout as outmask */
+   }
+}
+
+
 static INLINE unsigned
 build_mask_linear(int c, int dcdx, int dcdy)
 {
@@ -263,11 +323,14 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
       {
         const int dcdx = -plane[j].dcdx * 4;
         const int dcdy = plane[j].dcdy * 4;
-        const int cox = c[j] + plane[j].eo * 4;
-        const int cio = c[j] + plane[j].ei * 4 - 1;
-
-        outmask |= build_mask_linear(cox, dcdx, dcdy);
-        partmask |= build_mask_linear(cio, dcdx, dcdy);
+        const int cox = plane[j].eo * 4;
+        const int cio = plane[j].ei * 4 - 1;
+
+        build_masks(c[j] + cox,
+                    cio - cox,
+                    dcdx, dcdy, 
+                    &outmask,   /* sign bits from c[i][0..15] + cox */
+                    &partmask); /* sign bits from c[i][0..15] + cio */
       }
    }
 
index 0def5f724362b3037558e9504f90a4327116db9e..99a0bae45dba417bc7d0e6219a476eae5c3da966 100644 (file)
@@ -81,11 +81,14 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
    for (j = 0; j < NR_PLANES; j++) {
       const int dcdx = -plane[j].dcdx * 4;
       const int dcdy = plane[j].dcdy * 4;
-      const int cox = c[j] + plane[j].eo * 4;
-      const int cio = c[j] + plane[j].ei * 4 - 1;
-
-      outmask |= build_mask_linear(cox, dcdx, dcdy);
-      partmask |= build_mask_linear(cio, dcdx, dcdy);
+      const int cox = plane[j].eo * 4;
+      const int cio = plane[j].ei * 4 - 1;
+
+      build_masks(c[j] + cox,
+                 cio - cox,
+                 dcdx, dcdy, 
+                 &outmask,   /* sign bits from c[i][0..15] + cox */
+                 &partmask); /* sign bits from c[i][0..15] + cio */
    }
 
    if (outmask == 0xffff)
@@ -171,11 +174,14 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       {
         const int dcdx = -plane[j].dcdx * 16;
         const int dcdy = plane[j].dcdy * 16;
-        const int cox = c[j] + plane[j].eo * 16;
-        const int cio = c[j] + plane[j].ei * 16 - 1;
-
-        outmask |= build_mask_linear(cox, dcdx, dcdy);
-        partmask |= build_mask_linear(cio, dcdx, dcdy);
+        const int cox = plane[j].eo * 16;
+        const int cio = plane[j].ei * 16 - 1;
+
+        build_masks(c[j] + cox,
+                    cio - cox,
+                    dcdx, dcdy, 
+                    &outmask,   /* sign bits from c[i][0..15] + cox */
+                    &partmask); /* sign bits from c[i][0..15] + cio */
       }
 
       j++;