From: Roland Scheidegger Date: Sun, 31 Jan 2016 00:26:59 +0000 (+0100) Subject: llvmpipe: use vector loads for (optimized) tri raster funcs X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8aa168eb8f0f1f634d0fc6733812c70d3a597518;p=mesa.git llvmpipe: use vector loads for (optimized) tri raster funcs When we switched to 64bit rasterization, we could no longer use straight aligned loads for loading the plane data. However, what the code actually does for loading 3 planes, is 12 scalar loads + 9 unpacks, and then there's another 8 unpacks for the transpose we need (!). It would be possible to do the (scalar) loads of course already transposed (at least saving the additional unpacks), however instead just use (un)aligned vector loads, and recalculate the eo values, which is much less instructions (note in case of the triangle_32_3_4 case, the eo values are not even used, making the scalar loads + unpacks for them all the more pointless). This drops execution time of the triangle_32_3_4 function considerably, albeit it doesn't really make a measurable difference (for small tris we're essentially limited by vertex throughput in any case), for triangle_32_3_16 it's essentially noise (the loop is more costly than the initial code there). (I'm thinking about just ditching storing the eo values in the plane data, so could switch back to using aligned planes, however right now they are still used in the other raster functions dealing with planes with scalar code. Also not touching the ppc code, might not be that bad there in any case.) Reviewed-by: Brian Paul --- diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index db45cbbb057..34008e1c01e 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -308,17 +308,4 @@ void lp_debug_draw_bins_by_coverage( struct lp_scene *scene ); -#ifdef PIPE_ARCH_SSE -#include -#include "util/u_sse.h" - -static inline __m128i -lp_plane_to_m128i(const struct lp_rast_plane *plane) -{ - return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, - (int32_t)plane->dcdy, (int32_t)plane->eo); -} - -#endif - #endif diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 0ae6ec28d35..f4a2f0268f0 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff) void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) + const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); @@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; unsigned nr = 0; - __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ - __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ - __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ + __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); __m128i zero = _mm_setzero_si128(); - __m128i c; - __m128i dcdx; - __m128i dcdy; - __m128i rej4; - - __m128i dcdx2; - __m128i dcdx3; + __m128i c, dcdx, dcdy, rej4; + __m128i dcdx_neg_mask, dcdy_neg_mask; + __m128i dcdx2, dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; - + transpose4_epi32(&p0, &p1, &p2, &zero, - &c, &dcdx, &dcdy, &rej4); + &c, &unused, &dcdx, &dcdy); + + /* recalc eo - easier than trying to load as scalars / shuffle... */ + dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); + dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); + rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), + _mm_and_si128(dcdx_neg_mask, dcdx)); /* Adjust dcdx; */ @@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) + const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; unsigned y = (arg.triangle.plane_mask >> 8) + task->y; - __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ - __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ - __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ + __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ + __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); + __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); __m128i zero = _mm_setzero_si128(); - __m128i c; - __m128i dcdx; - __m128i dcdy; + __m128i c, dcdx, dcdy; + __m128i dcdx2, dcdx3; - __m128i dcdx2; - __m128i dcdx3; - __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; transpose4_epi32(&p0, &p1, &p2, &zero, - &c, &dcdx, &dcdy, &unused); + &c, &unused, &dcdx, &dcdy); /* Adjust dcdx; */