/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/*
 * Rasterization for binned triangles within a tile
 */

#include "util/u_math.h"
#include "lp_rast_priv.h"
/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}
/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;

   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}
static inline unsigned
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
   unsigned mask = 0;

   int32_t c0 = c;
   int32_t c1 = c0 + dcdy;
   int32_t c2 = c1 + dcdy;
   int32_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
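/*
 * Note: this relies on (v >> 31) of a negative int32_t producing all ones
 * (arithmetic shift), which the compilers targeted here guarantee.  Each
 * line above therefore sets bit (4*row + col) of the mask iff the edge
 * function is negative at that sample; scalar equivalent of one bit:
 *
 *    if (c0 + col * dcdx < 0)
 *       mask |= 1 << col;
 */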
static inline void
build_masks(int32_t c,
            int32_t cdiff,
            int32_t dcdx,
            int32_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}
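/*
 * Both masks are built from the same linear function sampled at two corner
 * values cdiff apart: one marks positions that can be wholly rejected
 * against this plane, the other positions that are only partially covered
 * and need further per-pixel work.
 */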
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}
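/*
 * The *_3_16 / *_4_16 / *_3_4 entry points above and below are special cases
 * for triangles known to be contained within a 16x16 (or 4x4) block: they
 * enable all edge planes and hand off to the generated N-plane rasterizer.
 */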
void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_ms_3(task, arg2);
}

void
lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_ms_3_16(task, arg);
}

void
lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_ms_4(task, arg2);
}
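/*
 * The _ms_* entry points mirror the single-sample ones above, routing to the
 * multisample rasterizers generated with MULTISAMPLE defined at the end of
 * this file.
 */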
#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>
#include "util/u_sse.h"
static inline void
build_masks_sse(int c, int cdiff,
                int dcdx, int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
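/*
 * _mm_packs_epi32/_mm_packs_epi16 narrow the 32-bit edge values to 8 bits
 * with signed saturation, preserving each lane's sign, so a single
 * _mm_movemask_epi8 extracts all 16 sign bits at once -- the SIMD equivalent
 * of the 16 shift-and-or lines in build_mask_linear().
 */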
static inline unsigned
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
static inline unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8, preserving sign bits
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}
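/*
 * Like build_mask_linear_sse() but for four step vectors the caller has
 * already computed: only the cdiff offset is applied before the sign bits
 * are extracted, so the same steps can be tested cheaply against several
 * corner values.
 */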
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx_neg_mask, dcdy_neg_mask;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   /* recalc eo - easier than trying to load as scalars / shuffle... */
   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                        _mm_and_si128(dcdx_neg_mask, dcdx));

   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of
    * having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }

         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
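/*
 * Each 4x4 block above is first tested against rej4 (the per-plane reject
 * corner offset scaled to a four-pixel step); if any plane rejects the whole
 * block it is skipped.  Surviving blocks get a full 3-plane x 16-pixel
 * evaluation whose sign bits are OR'd across planes, and the coverage masks
 * are queued in out[] so that shading runs as a separate second pass.
 */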
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
   __m128i zero = _mm_setzero_si128();

   __m128i c, dcdx, dcdy;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &unused, &dcdx, &dcdy);

   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of
    * having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x, y,
                                  0xffff & ~mask);
   }
}

#else /* PIPE_ARCH_SSE */

#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#include <altivec.h>
#include "util/u_pwr8.h"
static inline void
build_masks_ppc(int c, int cdiff,
                int dcdx, int dcdy,
                unsigned *outmask,
                unsigned *partmask)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *outmask |= vec_movemask_epi8(result);
   }

   {
      __m128i cio4 = (__m128i) vec_splats(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = vec_add_epi32(cstep0, cio4);
      cstep1 = vec_add_epi32(cstep1, cio4);
      cstep2 = vec_add_epi32(cstep2, cio4);
      cstep3 = vec_add_epi32(cstep3, cio4);

      cstep01 = vec_packs_epi32(cstep0, cstep1);
      cstep23 = vec_packs_epi32(cstep2, cstep3);
      result = vec_packs_epi16(cstep01, cstep23);

      *partmask |= vec_movemask_epi8(result);
   }
}
static inline unsigned
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = (__m128i) vec_splats(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = vec_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return vec_movemask_epi8(result);
}
static inline __m128i
lp_plane_to_m128i(const struct lp_rast_plane *plane)
{
   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
                         (int32_t)plane->dcdy, (int32_t)plane->eo);
}
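/*
 * The POWER path gathers each 24-byte plane field by field instead of using
 * the aligned/unaligned 128-bit loads of the SSE path; since eo is loaded
 * directly, the transpose below yields it as rej4 with no recalculation.
 */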
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c, dcdx, dcdy, rej4;
   __m128i dcdx2, dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#if UTIL_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }

         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
#else

void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_32_3(task, arg2);
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */
void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_32_4(task, arg2);
}

void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}

#endif /* PIPE_ARCH_SSE */
#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN)
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#endif
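/*
 * Everything below instantiates the rasterizer template once per plane
 * count, then again for the "_32_" (32-bit plane arithmetic) and "_ms_"
 * (multisample) variants.  Each TAG/NR_PLANES pair generates an
 * lp_rast_triangle_* entry point from the BUILD_MASKS/BUILD_MASK_LINEAR
 * primitives chosen above; a TRI_16/TRI_4 define, where present, selects
 * the name under which the template also emits the contained-block
 * specialization.
 */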
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_4
#define NR_PLANES 4
#ifdef PIPE_ARCH_SSE
#define TRI_16 lp_rast_triangle_32_4_16
#endif
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_32_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#define MULTISAMPLE 1

#define TAG(x) x##_ms_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_4
#define NR_PLANES 4
/*#define TRI_16 lp_rast_triangle_4_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_ms_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"