X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_rast_tri.c;h=f4a2f0268f0ec9247ffdbcdd703efced91dc904a;hb=01ab218bbc5c8058a99077a6bc3dc9884e9d218a;hp=232c8599e42d3d202b2247891ed764e87b3da09c;hpb=9d59b9d00cdb1e0e8bd139fba5250df869727386;p=mesa.git diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 232c8599e42..f4a2f0268f0 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -64,43 +64,43 @@ block_full_16(struct lp_rasterizer_task *task, } static inline unsigned -build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy) +build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy) { unsigned mask = 0; - int64_t c0 = c; - int64_t c1 = c0 + dcdy; - int64_t c2 = c1 + dcdy; - int64_t c3 = c2 + dcdy; - - mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0); - mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1); - mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2); - mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3); - mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4); - mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5); - mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6); - mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7); - mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8); - mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9); - mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10); - mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11); - mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12); - mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13); - mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14); - mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15); + int32_t c0 = c; + int32_t c1 = c0 + dcdy; + int32_t c2 = c1 + dcdy; + int32_t c3 = c2 + dcdy; + + mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); + mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); + mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); + mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); + mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); + mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); + mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); + mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); + mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); + mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); + mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); + mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); + mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); + mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); + mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); + mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); return mask; } static inline void -build_masks(int64_t c, - int64_t cdiff, - int64_t dcdx, - int64_t dcdy, - unsigned *outmask, - unsigned *partmask) +build_masks(int32_t c, + int32_t cdiff, + int32_t dcdx, + int32_t dcdy, + unsigned *outmask, + unsigned *partmask) { *outmask |= build_mask_linear(c, dcdx, dcdy); *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); @@ -140,12 +140,12 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task, static inline void -build_masks_32(int c, - int cdiff, - int dcdx, - int dcdy, - unsigned *outmask, - unsigned *partmask) +build_masks_sse(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) { __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = _mm_set1_epi32(dcdy); @@ -186,7 +186,7 @@ build_masks_32(int c, static inline unsigned -build_mask_linear_32(int c, int dcdx, int dcdy) +build_mask_linear_sse(int c, int dcdx, int dcdy) { __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = _mm_set1_epi32(dcdy); @@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff) void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) + const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); @@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; unsigned nr = 0; - __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ - __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ - __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ + __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); __m128i zero = _mm_setzero_si128(); - __m128i c; - __m128i dcdx; - __m128i dcdy; - __m128i rej4; - - __m128i dcdx2; - __m128i dcdx3; + __m128i c, dcdx, dcdy, rej4; + __m128i dcdx_neg_mask, dcdy_neg_mask; + __m128i dcdx2, dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; - + transpose4_epi32(&p0, &p1, &p2, &zero, - &c, &dcdx, &dcdy, &rej4); + &c, &unused, &dcdx, &dcdy); + + /* recalc eo - easier than trying to load as scalars / shuffle... */ + dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); + dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); + rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), + _mm_and_si128(dcdx_neg_mask, dcdx)); /* Adjust dcdx; */ @@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) + const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; unsigned y = (arg.triangle.plane_mask >> 8) + task->y; - __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ - __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ - __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ + __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); __m128i zero = _mm_setzero_si128(); - __m128i c; - __m128i dcdx; - __m128i dcdy; + __m128i c, dcdx, dcdy; + __m128i dcdx2, dcdx3; - __m128i dcdx2; - __m128i dcdx3; - __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; transpose4_epi32(&p0, &p1, &p2, &zero, - &c, &dcdx, &dcdy, &unused); + &c, &unused, &dcdx, &dcdy); /* Adjust dcdx; */ @@ -442,12 +442,12 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #include "util/u_pwr8.h" static inline void -build_masks_32(int c, - int cdiff, - int dcdx, - int dcdy, - unsigned *outmask, - unsigned *partmask) +build_masks_ppc(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) { __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = (__m128i) vec_splats(dcdy); @@ -487,7 +487,7 @@ build_masks_32(int c, } static inline unsigned -build_mask_linear_32(int c, int dcdx, int dcdy) +build_mask_linear_ppc(int c, int dcdx, int dcdy) { __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = (__m128i) vec_splats(dcdy); @@ -684,8 +684,18 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #endif +#if defined PIPE_ARCH_SSE +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy) +#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)) +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy) +#else #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask) #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy) +#endif + +#define RASTER_64 1 #define TAG(x) x##_1 #define NR_PLANES 1 @@ -722,12 +732,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #define NR_PLANES 8 #include "lp_rast_tri_tmp.h" -#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)) -#undef BUILD_MASKS -#undef BUILD_MASK_LINEAR -#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) -#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy) -#endif +#undef RASTER_64 #define TAG(x) x##_32_1 #define NR_PLANES 1