From d8c92a1eea555f8b9d673a3f2a708de5faf8b3cd Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sun, 15 Aug 2010 18:19:52 +0100
Subject: [PATCH] llvmpipe: intrinsics versions of build_mask functions

---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 78 +++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 8d729c74818..5b3ad6e0a78 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -67,7 +67,7 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
-
+#if !defined(PIPE_ARCH_SSE)
 static INLINE unsigned
 build_mask(int c, int dcdx, int dcdy)
 {
@@ -98,6 +98,7 @@ build_mask(int c, int dcdx, int dcdy)
    return mask;
 }
 
+
 static INLINE unsigned
 build_mask_linear(int c, int dcdx, int dcdy)
 {
@@ -127,6 +128,81 @@ build_mask_linear(int c, int dcdx, int dcdy)
   
    return mask;
 }
+#else
+#include <emmintrin.h>
+#include "util/u_sse.h"
+
+/* Bit (4*j + i) of the result is set when c + i*dcdx + j*dcdy < 0. */
+static INLINE unsigned
+build_mask_linear(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+   /* cstep0 holds the four values for j = 0 (i = 0..3, low lane first);
+    * each further vector adds dcdy once more, giving j = 1, 2 and 3. */
+   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16; the pack saturates as signed,
+    * so a negative value stays negative */
+   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits; the sixteen bytes are in
+    * the order cstep0, cstep1, cstep2, cstep3 */
+   __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask: bit n of the return value is
+    * the top bit of byte n of result */
+   return _mm_movemask_epi8(result);
+}
+/* Sign mask of c + i*dcdx + j*dcdy (i, j in 0..3), grouped as 2x2 quads. */
+static INLINE unsigned
+build_mask(int c, int dcdx, int dcdy)
+{
+   __m128i step = _mm_setr_epi32(0, dcdx, dcdy, dcdx + dcdy);
+   __m128i c0 = _mm_set1_epi32(c);
+
+   /* Get values across the quad: lanes are (i, j) = (0,0), (1,0),
+    * (0,1), (1,1), following the order of the offsets in step. */
+   __m128i cstep0 = _mm_add_epi32(c0, step);
+
+   /* Scale up step for moving between quads.  Neighbouring quads are
+    * two samples apart, so despite its name step4 holds 2 * step:
+    * the shift count is 1.  The shift discards bits that overflow,
+    * but a step value large enough for that to matter over such a
+    * short distance would make rendering incorrect anyway.
+    */
+   __m128i step4 = _mm_slli_epi32(step, 1);
+
+   /* Get values for the remaining quads: cstep1 adds lane 1 of step4
+    * (2*dcdx), cstep2 adds lane 2 (2*dcdy), cstep3 adds both. */
+   __m128i cstep1 = _mm_add_epi32(cstep0, 
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
+   __m128i cstep2 = _mm_add_epi32(cstep0,
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(2,2,2,2)));
+   __m128i cstep3 = _mm_add_epi32(cstep2,
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
+
+   /* pack pairs of results into epi16; the pack saturates as signed,
+    * so a negative value stays negative */
+   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits; the sixteen bytes are in
+    * the order cstep0, cstep1, cstep2, cstep3 */
+   __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask: bits 0-3 come from cstep0,
+    * 4-7 from cstep1, 8-11 from cstep2 and 12-15 from cstep3 */
+   return _mm_movemask_epi8(result);
+}
+
+#endif
+
+
 
 
 #define TAG(x) x##_1
-- 
2.30.2