41f6fbfa0596693adad73b00675af73ac9bca3e9
1 /**************************************************************************
3 * Copyright 2007-2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 * Rasterization for binned triangles within a tile
33 #include "util/u_math.h"
36 #include "lp_rast_priv.h"
39 * Shade all pixels in a 4x4 block.
42 block_full_4(struct lp_rasterizer_task
*task
,
43 const struct lp_rast_triangle
*tri
,
46 lp_rast_shade_quads_all(task
, &tri
->inputs
, x
, y
);
51 * Shade all pixels in a 16x16 block.
54 block_full_16(struct lp_rasterizer_task
*task
,
55 const struct lp_rast_triangle
*tri
,
61 for (iy
= 0; iy
< 16; iy
+= 4)
62 for (ix
= 0; ix
< 16; ix
+= 4)
63 block_full_4(task
, tri
, x
+ ix
, y
+ iy
);
66 static INLINE
unsigned
67 build_mask_linear(int64_t c
, int64_t dcdx
, int64_t dcdy
)
72 int64_t c1
= c0
+ dcdy
;
73 int64_t c2
= c1
+ dcdy
;
74 int64_t c3
= c2
+ dcdy
;
76 mask
|= ((c0
+ 0 * dcdx
) >> FIXED_SHIFT
) & (1 << 0);
77 mask
|= ((c0
+ 1 * dcdx
) >> FIXED_SHIFT
) & (1 << 1);
78 mask
|= ((c0
+ 2 * dcdx
) >> FIXED_SHIFT
) & (1 << 2);
79 mask
|= ((c0
+ 3 * dcdx
) >> FIXED_SHIFT
) & (1 << 3);
80 mask
|= ((c1
+ 0 * dcdx
) >> FIXED_SHIFT
) & (1 << 4);
81 mask
|= ((c1
+ 1 * dcdx
) >> FIXED_SHIFT
) & (1 << 5);
82 mask
|= ((c1
+ 2 * dcdx
) >> FIXED_SHIFT
) & (1 << 6);
83 mask
|= ((c1
+ 3 * dcdx
) >> FIXED_SHIFT
) & (1 << 7);
84 mask
|= ((c2
+ 0 * dcdx
) >> FIXED_SHIFT
) & (1 << 8);
85 mask
|= ((c2
+ 1 * dcdx
) >> FIXED_SHIFT
) & (1 << 9);
86 mask
|= ((c2
+ 2 * dcdx
) >> FIXED_SHIFT
) & (1 << 10);
87 mask
|= ((c2
+ 3 * dcdx
) >> FIXED_SHIFT
) & (1 << 11);
88 mask
|= ((c3
+ 0 * dcdx
) >> FIXED_SHIFT
) & (1 << 12);
89 mask
|= ((c3
+ 1 * dcdx
) >> FIXED_SHIFT
) & (1 << 13);
90 mask
|= ((c3
+ 2 * dcdx
) >> FIXED_SHIFT
) & (1 << 14);
91 mask
|= ((c3
+ 3 * dcdx
) >> FIXED_SHIFT
) & (1 << 15);
98 build_masks(int64_t c
,
105 *outmask
|= build_mask_linear(c
, dcdx
, dcdy
);
106 *partmask
|= build_mask_linear(c
+ cdiff
, dcdx
, dcdy
);
110 lp_rast_triangle_3_16(struct lp_rasterizer_task
*task
,
111 const union lp_rast_cmd_arg arg
)
113 union lp_rast_cmd_arg arg2
;
114 arg2
.triangle
.tri
= arg
.triangle
.tri
;
115 arg2
.triangle
.plane_mask
= (1<<3)-1;
116 lp_rast_triangle_3(task
, arg2
);
120 lp_rast_triangle_3_4(struct lp_rasterizer_task
*task
,
121 const union lp_rast_cmd_arg arg
)
123 lp_rast_triangle_3_16(task
, arg
);
127 lp_rast_triangle_4_16(struct lp_rasterizer_task
*task
,
128 const union lp_rast_cmd_arg arg
)
130 union lp_rast_cmd_arg arg2
;
131 arg2
.triangle
.tri
= arg
.triangle
.tri
;
132 arg2
.triangle
.plane_mask
= (1<<4)-1;
133 lp_rast_triangle_4(task
, arg2
);
136 #if !defined(PIPE_ARCH_SSE)
139 lp_rast_triangle_32_3_16(struct lp_rasterizer_task
*task
,
140 const union lp_rast_cmd_arg arg
)
142 union lp_rast_cmd_arg arg2
;
143 arg2
.triangle
.tri
= arg
.triangle
.tri
;
144 arg2
.triangle
.plane_mask
= (1<<3)-1;
145 lp_rast_triangle_32_3(task
, arg2
);
149 lp_rast_triangle_32_4_16(struct lp_rasterizer_task
*task
,
150 const union lp_rast_cmd_arg arg
)
152 union lp_rast_cmd_arg arg2
;
153 arg2
.triangle
.tri
= arg
.triangle
.tri
;
154 arg2
.triangle
.plane_mask
= (1<<4)-1;
155 lp_rast_triangle_32_4(task
, arg2
);
159 lp_rast_triangle_32_3_4(struct lp_rasterizer_task
*task
,
160 const union lp_rast_cmd_arg arg
)
162 lp_rast_triangle_32_3_16(task
, arg
);
166 #include <emmintrin.h>
167 #include "util/u_sse.h"
171 build_masks_32(int c
,
178 __m128i cstep0
= _mm_setr_epi32(c
, c
+dcdx
, c
+dcdx
*2, c
+dcdx
*3);
179 __m128i xdcdy
= _mm_set1_epi32(dcdy
);
181 /* Get values across the quad
183 __m128i cstep1
= _mm_add_epi32(cstep0
, xdcdy
);
184 __m128i cstep2
= _mm_add_epi32(cstep1
, xdcdy
);
185 __m128i cstep3
= _mm_add_epi32(cstep2
, xdcdy
);
188 __m128i cstep01
, cstep23
, result
;
190 cstep01
= _mm_packs_epi32(cstep0
, cstep1
);
191 cstep23
= _mm_packs_epi32(cstep2
, cstep3
);
192 result
= _mm_packs_epi16(cstep01
, cstep23
);
194 *outmask
|= _mm_movemask_epi8(result
);
199 __m128i cio4
= _mm_set1_epi32(cdiff
);
200 __m128i cstep01
, cstep23
, result
;
202 cstep0
= _mm_add_epi32(cstep0
, cio4
);
203 cstep1
= _mm_add_epi32(cstep1
, cio4
);
204 cstep2
= _mm_add_epi32(cstep2
, cio4
);
205 cstep3
= _mm_add_epi32(cstep3
, cio4
);
207 cstep01
= _mm_packs_epi32(cstep0
, cstep1
);
208 cstep23
= _mm_packs_epi32(cstep2
, cstep3
);
209 result
= _mm_packs_epi16(cstep01
, cstep23
);
211 *partmask
|= _mm_movemask_epi8(result
);
216 static INLINE
unsigned
217 build_mask_linear_32(int c
, int dcdx
, int dcdy
)
219 __m128i cstep0
= _mm_setr_epi32(c
, c
+dcdx
, c
+dcdx
*2, c
+dcdx
*3);
220 __m128i xdcdy
= _mm_set1_epi32(dcdy
);
222 /* Get values across the quad
224 __m128i cstep1
= _mm_add_epi32(cstep0
, xdcdy
);
225 __m128i cstep2
= _mm_add_epi32(cstep1
, xdcdy
);
226 __m128i cstep3
= _mm_add_epi32(cstep2
, xdcdy
);
228 /* pack pairs of results into epi16
230 __m128i cstep01
= _mm_packs_epi32(cstep0
, cstep1
);
231 __m128i cstep23
= _mm_packs_epi32(cstep2
, cstep3
);
233 /* pack into epi8, preserving sign bits
235 __m128i result
= _mm_packs_epi16(cstep01
, cstep23
);
237 /* extract sign bits to create mask
239 return _mm_movemask_epi8(result
);
242 static INLINE
unsigned
243 sign_bits4(const __m128i
*cstep
, int cdiff
)
246 /* Adjust the step values
248 __m128i cio4
= _mm_set1_epi32(cdiff
);
249 __m128i cstep0
= _mm_add_epi32(cstep
[0], cio4
);
250 __m128i cstep1
= _mm_add_epi32(cstep
[1], cio4
);
251 __m128i cstep2
= _mm_add_epi32(cstep
[2], cio4
);
252 __m128i cstep3
= _mm_add_epi32(cstep
[3], cio4
);
256 __m128i cstep01
= _mm_packs_epi32(cstep0
, cstep1
);
257 __m128i cstep23
= _mm_packs_epi32(cstep2
, cstep3
);
258 __m128i result
= _mm_packs_epi16(cstep01
, cstep23
);
260 /* Extract the sign bits
262 return _mm_movemask_epi8(result
);
275 lp_rast_triangle_32_3_16(struct lp_rasterizer_task
*task
,
276 const union lp_rast_cmd_arg arg
)
278 const struct lp_rast_triangle
*tri
= arg
.triangle
.tri
;
279 const struct lp_rast_plane
*plane
= GET_PLANES(tri
);
280 int x
= (arg
.triangle
.plane_mask
& 0xff) + task
->x
;
281 int y
= (arg
.triangle
.plane_mask
>> 8) + task
->y
;
284 struct { unsigned mask
:16; unsigned i
:8; unsigned j
:8; } out
[16];
287 __m128i p0
= lp_plane_to_m128i(&plane
[0]); /* c, dcdx, dcdy, eo */
288 __m128i p1
= lp_plane_to_m128i(&plane
[1]); /* c, dcdx, dcdy, eo */
289 __m128i p2
= lp_plane_to_m128i(&plane
[2]); /* c, dcdx, dcdy, eo */
290 __m128i zero
= _mm_setzero_si128();
300 __m128i span_0
; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
301 __m128i span_1
; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
302 __m128i span_2
; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
305 transpose4_epi32(&p0
, &p1
, &p2
, &zero
,
306 &c
, &dcdx
, &dcdy
, &rej4
);
310 dcdx
= _mm_sub_epi32(zero
, dcdx
);
312 c
= _mm_add_epi32(c
, mm_mullo_epi32(dcdx
, _mm_set1_epi32(x
)));
313 c
= _mm_add_epi32(c
, mm_mullo_epi32(dcdy
, _mm_set1_epi32(y
)));
314 rej4
= _mm_slli_epi32(rej4
, 2);
316 /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
317 c
= _mm_sub_epi32(c
, _mm_set1_epi32(1));
318 rej4
= _mm_add_epi32(rej4
, _mm_set1_epi32(1));
320 dcdx2
= _mm_add_epi32(dcdx
, dcdx
);
321 dcdx3
= _mm_add_epi32(dcdx2
, dcdx
);
323 transpose4_epi32(&zero
, &dcdx
, &dcdx2
, &dcdx3
,
324 &span_0
, &span_1
, &span_2
, &unused
);
326 for (i
= 0; i
< 4; i
++) {
329 for (j
= 0; j
< 4; j
++) {
330 __m128i c4rej
= _mm_add_epi32(cx
, rej4
);
331 __m128i rej_masks
= _mm_srai_epi32(c4rej
, 31);
333 /* if (is_zero(rej_masks)) */
334 if (_mm_movemask_epi8(rej_masks
) == 0) {
335 __m128i c0_0
= _mm_add_epi32(SCALAR_EPI32(cx
, 0), span_0
);
336 __m128i c1_0
= _mm_add_epi32(SCALAR_EPI32(cx
, 1), span_1
);
337 __m128i c2_0
= _mm_add_epi32(SCALAR_EPI32(cx
, 2), span_2
);
339 __m128i c_0
= _mm_or_si128(_mm_or_si128(c0_0
, c1_0
), c2_0
);
341 __m128i c0_1
= _mm_add_epi32(c0_0
, SCALAR_EPI32(dcdy
, 0));
342 __m128i c1_1
= _mm_add_epi32(c1_0
, SCALAR_EPI32(dcdy
, 1));
343 __m128i c2_1
= _mm_add_epi32(c2_0
, SCALAR_EPI32(dcdy
, 2));
345 __m128i c_1
= _mm_or_si128(_mm_or_si128(c0_1
, c1_1
), c2_1
);
346 __m128i c_01
= _mm_packs_epi32(c_0
, c_1
);
348 __m128i c0_2
= _mm_add_epi32(c0_1
, SCALAR_EPI32(dcdy
, 0));
349 __m128i c1_2
= _mm_add_epi32(c1_1
, SCALAR_EPI32(dcdy
, 1));
350 __m128i c2_2
= _mm_add_epi32(c2_1
, SCALAR_EPI32(dcdy
, 2));
352 __m128i c_2
= _mm_or_si128(_mm_or_si128(c0_2
, c1_2
), c2_2
);
354 __m128i c0_3
= _mm_add_epi32(c0_2
, SCALAR_EPI32(dcdy
, 0));
355 __m128i c1_3
= _mm_add_epi32(c1_2
, SCALAR_EPI32(dcdy
, 1));
356 __m128i c2_3
= _mm_add_epi32(c2_2
, SCALAR_EPI32(dcdy
, 2));
358 __m128i c_3
= _mm_or_si128(_mm_or_si128(c0_3
, c1_3
), c2_3
);
359 __m128i c_23
= _mm_packs_epi32(c_2
, c_3
);
360 __m128i c_0123
= _mm_packs_epi16(c_01
, c_23
);
362 unsigned mask
= _mm_movemask_epi8(c_0123
);
370 cx
= _mm_add_epi32(cx
, _mm_slli_epi32(dcdx
, 2));
373 c
= _mm_add_epi32(c
, _mm_slli_epi32(dcdy
, 2));
376 for (i
= 0; i
< nr
; i
++)
377 lp_rast_shade_quads_mask(task
,
381 0xffff & ~out
[i
].mask
);
389 lp_rast_triangle_32_3_4(struct lp_rasterizer_task
*task
,
390 const union lp_rast_cmd_arg arg
)
392 const struct lp_rast_triangle
*tri
= arg
.triangle
.tri
;
393 const struct lp_rast_plane
*plane
= GET_PLANES(tri
);
394 unsigned x
= (arg
.triangle
.plane_mask
& 0xff) + task
->x
;
395 unsigned y
= (arg
.triangle
.plane_mask
>> 8) + task
->y
;
397 __m128i p0
= lp_plane_to_m128i(&plane
[0]); /* c, dcdx, dcdy, eo */
398 __m128i p1
= lp_plane_to_m128i(&plane
[1]); /* c, dcdx, dcdy, eo */
399 __m128i p2
= lp_plane_to_m128i(&plane
[2]); /* c, dcdx, dcdy, eo */
400 __m128i zero
= _mm_setzero_si128();
409 __m128i span_0
; /* 0,dcdx,2dcdx,3dcdx for plane 0 */
410 __m128i span_1
; /* 0,dcdx,2dcdx,3dcdx for plane 1 */
411 __m128i span_2
; /* 0,dcdx,2dcdx,3dcdx for plane 2 */
414 transpose4_epi32(&p0
, &p1
, &p2
, &zero
,
415 &c
, &dcdx
, &dcdy
, &unused
);
419 dcdx
= _mm_sub_epi32(zero
, dcdx
);
421 c
= _mm_add_epi32(c
, mm_mullo_epi32(dcdx
, _mm_set1_epi32(x
)));
422 c
= _mm_add_epi32(c
, mm_mullo_epi32(dcdy
, _mm_set1_epi32(y
)));
424 /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
425 c
= _mm_sub_epi32(c
, _mm_set1_epi32(1));
427 dcdx2
= _mm_add_epi32(dcdx
, dcdx
);
428 dcdx3
= _mm_add_epi32(dcdx2
, dcdx
);
430 transpose4_epi32(&zero
, &dcdx
, &dcdx2
, &dcdx3
,
431 &span_0
, &span_1
, &span_2
, &unused
);
435 __m128i c0_0
= _mm_add_epi32(SCALAR_EPI32(c
, 0), span_0
);
436 __m128i c1_0
= _mm_add_epi32(SCALAR_EPI32(c
, 1), span_1
);
437 __m128i c2_0
= _mm_add_epi32(SCALAR_EPI32(c
, 2), span_2
);
439 __m128i c_0
= _mm_or_si128(_mm_or_si128(c0_0
, c1_0
), c2_0
);
441 __m128i c0_1
= _mm_add_epi32(c0_0
, SCALAR_EPI32(dcdy
, 0));
442 __m128i c1_1
= _mm_add_epi32(c1_0
, SCALAR_EPI32(dcdy
, 1));
443 __m128i c2_1
= _mm_add_epi32(c2_0
, SCALAR_EPI32(dcdy
, 2));
445 __m128i c_1
= _mm_or_si128(_mm_or_si128(c0_1
, c1_1
), c2_1
);
446 __m128i c_01
= _mm_packs_epi32(c_0
, c_1
);
448 __m128i c0_2
= _mm_add_epi32(c0_1
, SCALAR_EPI32(dcdy
, 0));
449 __m128i c1_2
= _mm_add_epi32(c1_1
, SCALAR_EPI32(dcdy
, 1));
450 __m128i c2_2
= _mm_add_epi32(c2_1
, SCALAR_EPI32(dcdy
, 2));
452 __m128i c_2
= _mm_or_si128(_mm_or_si128(c0_2
, c1_2
), c2_2
);
454 __m128i c0_3
= _mm_add_epi32(c0_2
, SCALAR_EPI32(dcdy
, 0));
455 __m128i c1_3
= _mm_add_epi32(c1_2
, SCALAR_EPI32(dcdy
, 1));
456 __m128i c2_3
= _mm_add_epi32(c2_2
, SCALAR_EPI32(dcdy
, 2));
458 __m128i c_3
= _mm_or_si128(_mm_or_si128(c0_3
, c1_3
), c2_3
);
459 __m128i c_23
= _mm_packs_epi32(c_2
, c_3
);
460 __m128i c_0123
= _mm_packs_epi16(c_01
, c_23
);
462 unsigned mask
= _mm_movemask_epi8(c_0123
);
465 lp_rast_shade_quads_mask(task
,
477 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
478 #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
482 #include "lp_rast_tri_tmp.h"
486 #include "lp_rast_tri_tmp.h"
490 /*#define TRI_4 lp_rast_triangle_3_4*/
491 /*#define TRI_16 lp_rast_triangle_3_16*/
492 #include "lp_rast_tri_tmp.h"
496 /*#define TRI_16 lp_rast_triangle_4_16*/
497 #include "lp_rast_tri_tmp.h"
501 #include "lp_rast_tri_tmp.h"
505 #include "lp_rast_tri_tmp.h"
509 #include "lp_rast_tri_tmp.h"
513 #include "lp_rast_tri_tmp.h"
517 #undef BUILD_MASK_LINEAR
518 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
519 #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
522 #define TAG(x) x##_32_1
524 #include "lp_rast_tri_tmp.h"
526 #define TAG(x) x##_32_2
528 #include "lp_rast_tri_tmp.h"
530 #define TAG(x) x##_32_3
532 /*#define TRI_4 lp_rast_triangle_3_4*/
533 /*#define TRI_16 lp_rast_triangle_3_16*/
534 #include "lp_rast_tri_tmp.h"
536 #define TAG(x) x##_32_4
539 #define TRI_16 lp_rast_triangle_32_4_16
541 #include "lp_rast_tri_tmp.h"
543 #define TAG(x) x##_32_5
545 #include "lp_rast_tri_tmp.h"
547 #define TAG(x) x##_32_6
549 #include "lp_rast_tri_tmp.h"
551 #define TAG(x) x##_32_7
553 #include "lp_rast_tri_tmp.h"
555 #define TAG(x) x##_32_8
557 #include "lp_rast_tri_tmp.h"