llvmpipe: Fix MSVC build.
[mesa.git] / src / gallium / drivers / llvmpipe / lp_rast_tri.c
/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"
#include "lp_tile_soa.h"



/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

#if !defined(PIPE_ARCH_SSE)

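/**
 * Compute a 16-bit coverage mask for a 4x4 block by stepping the plane
 * value c across the block with dcdx/dcdy.  Bit (4*row + col) is set when
 * the plane value at that pixel is negative.
 */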
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   int mask = 0;

   int c0 = c;
   int c1 = c0 + dcdy;
   int c2 = c1 + dcdy;
   int c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}


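/**
 * Evaluate the same 4x4 step pattern at two offsets: OR the sign mask for
 * c into *outmask and the sign mask for c + cdiff into *partmask.
 */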
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}

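/**
 * Non-SSE fallback: rasterize a 16x16 block of a 3-plane triangle by
 * re-dispatching to the general 3-plane rasterizer with all planes enabled.
 */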
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

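/**
 * Non-SSE fallback: rasterize a 16x16 block of a 4-plane triangle by
 * re-dispatching to the general 4-plane rasterizer with all planes enabled.
 */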
void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

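/**
 * Non-SSE fallback for the 4x4 case: defer to lp_rast_triangle_3_16() above.
 */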
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}

#else
#include <emmintrin.h>
#include "util/u_sse.h"

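/**
 * SSE2 version of build_masks(): step the plane value across a 4x4 block,
 * pack the results down to bytes and extract the sign bits, ORing the mask
 * for c into *outmask and the mask for c + cdiff into *partmask.
 */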
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}


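/**
 * SSE2 version of build_mask_linear(): compute the 16-bit sign mask for a
 * 4x4 block in one pass.
 */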
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}

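/**
 * Add cdiff to the four precomputed row vectors of plane values and return
 * the 16 resulting sign bits as a mask.
 */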
static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{

   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}

#endif




#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

#ifdef PIPE_ARCH_SSE

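/**
 * Transpose a 4x4 matrix of 32-bit values: *o..*r receive the columns of
 * the matrix whose rows are *a..*d.
 */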
static INLINE void
transpose4_epi32(const __m128i * restrict a,
                 const __m128i * restrict b,
                 const __m128i * restrict c,
                 const __m128i * restrict d,
                 __m128i * restrict o,
                 __m128i * restrict p,
                 __m128i * restrict q,
                 __m128i * restrict r)
{
   __m128i t0 = _mm_unpacklo_epi32(*a, *b);
   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
   __m128i t2 = _mm_unpackhi_epi32(*a, *b);
   __m128i t3 = _mm_unpackhi_epi32(*c, *d);

   *o = _mm_unpacklo_epi64(t0, t1);
   *p = _mm_unpackhi_epi64(t0, t1);
   *q = _mm_unpacklo_epi64(t2, t3);
   *r = _mm_unpackhi_epi64(t2, t3);
}

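/* Broadcast 32-bit lane i of m to all four lanes. */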
#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))

#define NR_PLANES 3


/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
 * _mm_mul_epu32().
 *
 * I suspect this works fine for us because one of our operands is
 * always positive, but not sure that this can be used for general
 * signed integer multiplication.
 *
 * This seems close enough to the speed of SSE4 and the real
 * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
 * dependency at this point.
 */
static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
{
   __m128i a4 = _mm_srli_si128(a, 4);      /* shift by one dword */
   __m128i b4 = _mm_srli_si128(b, 4);      /* shift by one dword */
   __m128i ba = _mm_mul_epu32(b, a);       /* multiply dwords 0, 2 */
   __m128i b4a4 = _mm_mul_epu32(b4, a4);   /* multiply dwords 1, 3 */

   /* Interleave the results, either with shuffles or (slightly
    * faster) direct bit operations:
    */
#if 0
   __m128i ba8 = _mm_shuffle_epi32(ba, 8);
   __m128i b4a48 = _mm_shuffle_epi32(b4a4, 8);
   __m128i result = _mm_unpacklo_epi32(ba8, b4a48);
#else
   __m128i mask = _mm_setr_epi32(~0,0,~0,0);
   __m128i ba_mask = _mm_and_si128(ba, mask);
   __m128i b4a4_mask = _mm_and_si128(b4a4, mask);
   __m128i b4a4_mask_shift = _mm_slli_si128(b4a4_mask, 4);
   __m128i result = _mm_or_si128(ba_mask, b4a4_mask_shift);
#endif

   return result;
}


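/**
 * Rasterize a 16x16 block for a triangle with three edge planes using SSE2.
 * The plane_mask field of the command argument is reused here to carry the
 * block position relative to the tile origin.  Each of the sixteen 4x4
 * sub-blocks is first tested for trivial rejection; the surviving blocks
 * get a per-pixel coverage mask and are shaded at the end.
 */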
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = tri->plane;
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}



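/**
 * Rasterize a single 4x4 block for a triangle with three edge planes using
 * SSE2.  As in lp_rast_triangle_3_16() above, the plane_mask field carries
 * the block position within the tile; no trivial-reject step is needed
 * since only one 4x4 block is covered.
 */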
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = tri->plane;
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#endif /* PIPE_ARCH_SSE */