From 0aa3a09ced07e150901cd0f7a7917556a018c252 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 22 Aug 2010 22:56:54 +0100 Subject: [PATCH] llvmpipe: combine linear mask calculation --- src/gallium/drivers/llvmpipe/lp_rast_tri.c | 73 +++++++++++++++++-- .../drivers/llvmpipe/lp_rast_tri_tmp.h | 26 ++++--- 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 5b3ad6e0a78..bdb8d131ccd 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -128,11 +128,71 @@ build_mask_linear(int c, int dcdx, int dcdy) return mask; } + + +static INLINE void +build_masks(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + *outmask |= build_mask_linear(c, dcdx, dcdy); + *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); +} + #else #include #include "util/u_sse.h" +static INLINE void +build_masks(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = _mm_set1_epi32(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + { + __m128i cstep01, cstep23, result; + + cstep01 = _mm_packs_epi32(cstep0, cstep1); + cstep23 = _mm_packs_epi32(cstep2, cstep3); + result = _mm_packs_epi16(cstep01, cstep23); + + *outmask |= _mm_movemask_epi8(result); + } + + + { + __m128i cio4 = _mm_set1_epi32(cdiff); + __m128i cstep01, cstep23, result; + + cstep0 = _mm_add_epi32(cstep0, cio4); + cstep1 = _mm_add_epi32(cstep1, cio4); + cstep2 = _mm_add_epi32(cstep2, cio4); + cstep3 = _mm_add_epi32(cstep3, cio4); + + cstep01 = _mm_packs_epi32(cstep0, cstep1); + cstep23 = _mm_packs_epi32(cstep2, cstep3); + result = _mm_packs_epi16(cstep01, cstep23); + + *partmask |= _mm_movemask_epi8(result); + } +} + + static INLINE unsigned build_mask_linear(int c, int dcdx, int dcdy) { @@ -263,11 +323,14 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task, { const int dcdx = -plane[j].dcdx * 4; const int dcdy = plane[j].dcdy * 4; - const int cox = c[j] + plane[j].eo * 4; - const int cio = c[j] + plane[j].ei * 4 - 1; - - outmask |= build_mask_linear(cox, dcdx, dcdy); - partmask |= build_mask_linear(cio, dcdx, dcdy); + const int cox = plane[j].eo * 4; + const int cio = plane[j].ei * 4 - 1; + + build_masks(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ } } diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index 0def5f72436..99a0bae45db 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -81,11 +81,14 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, for (j = 0; j < NR_PLANES; j++) { const int dcdx = -plane[j].dcdx * 4; const int dcdy = plane[j].dcdy * 4; - const int cox = c[j] + plane[j].eo * 4; - const int cio = c[j] + plane[j].ei * 4 - 1; - - outmask |= build_mask_linear(cox, dcdx, dcdy); - partmask |= build_mask_linear(cio, dcdx, dcdy); + const int cox = plane[j].eo * 4; + const int cio = plane[j].ei * 4 - 1; + + build_masks(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ } if (outmask == 0xffff) @@ -171,11 +174,14 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, { const int dcdx = -plane[j].dcdx * 16; const int dcdy = plane[j].dcdy * 16; - const int cox = c[j] + plane[j].eo * 16; - const int cio = c[j] + plane[j].ei * 16 - 1; - - outmask |= build_mask_linear(cox, dcdx, dcdy); - partmask |= build_mask_linear(cio, dcdx, dcdy); + const int cox = plane[j].eo * 16; + const int cio = plane[j].ei * 16 - 1; + + build_masks(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ } j++; -- 2.30.2