scons: Use -Werror MSVC compatibility flags per-directory.
[mesa.git] / src / gallium / drivers / llvmpipe / lp_rast_tri.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /*
29 * Rasterization for binned triangles within a tile
30 */
31
32 #include <limits.h>
33 #include "util/u_math.h"
34 #include "lp_debug.h"
35 #include "lp_perf.h"
36 #include "lp_rast_priv.h"
37
/**
 * Shade all pixels in a 4x4 block.
 *
 * Used for blocks known to be fully inside the triangle, so no
 * coverage mask is computed -- every quad is shaded.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}
48
49
/**
 * Shade all pixels in a 16x16 block (known fully inside the triangle).
 *
 * The block is decomposed into sixteen 4x4 sub-blocks, visited in
 * row-major order.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned i;

   assert(x % 16 == 0);
   assert(y % 16 == 0);

   for (i = 0; i < 16; i++) {
      /* (i & 3) selects the column, (i >> 2) the row -- same visiting
       * order as a row-major double loop.
       */
      block_full_4(task, tri, x + (i & 3) * 4, y + (i >> 2) * 4);
   }
}
65
66 static INLINE unsigned
67 build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
68 {
69 unsigned mask = 0;
70
71 int64_t c0 = c;
72 int64_t c1 = c0 + dcdy;
73 int64_t c2 = c1 + dcdy;
74 int64_t c3 = c2 + dcdy;
75
76 mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
77 mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
78 mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
79 mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
80 mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
81 mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
82 mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
83 mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
84 mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
85 mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
86 mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
87 mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
88 mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
89 mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
90 mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
91 mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);
92
93 return mask;
94 }
95
96
97 static INLINE void
98 build_masks(int64_t c,
99 int64_t cdiff,
100 int64_t dcdx,
101 int64_t dcdy,
102 unsigned *outmask,
103 unsigned *partmask)
104 {
105 *outmask |= build_mask_linear(c, dcdx, dcdy);
106 *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
107 }
108
109 void
110 lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
111 const union lp_rast_cmd_arg arg)
112 {
113 union lp_rast_cmd_arg arg2;
114 arg2.triangle.tri = arg.triangle.tri;
115 arg2.triangle.plane_mask = (1<<3)-1;
116 lp_rast_triangle_3(task, arg2);
117 }
118
/**
 * Rasterize a 3-plane triangle over a 4x4 block.
 *
 * Simply forwards to the 16x16 variant, which handles the block via the
 * generic 3-plane path.
 */
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}
125
126 void
127 lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
128 const union lp_rast_cmd_arg arg)
129 {
130 union lp_rast_cmd_arg arg2;
131 arg2.triangle.tri = arg.triangle.tri;
132 arg2.triangle.plane_mask = (1<<4)-1;
133 lp_rast_triangle_4(task, arg2);
134 }
135
136 #if !defined(PIPE_ARCH_SSE)
137
138 void
139 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
140 const union lp_rast_cmd_arg arg)
141 {
142 union lp_rast_cmd_arg arg2;
143 arg2.triangle.tri = arg.triangle.tri;
144 arg2.triangle.plane_mask = (1<<3)-1;
145 lp_rast_triangle_32_3(task, arg2);
146 }
147
148 void
149 lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
150 const union lp_rast_cmd_arg arg)
151 {
152 union lp_rast_cmd_arg arg2;
153 arg2.triangle.tri = arg.triangle.tri;
154 arg2.triangle.plane_mask = (1<<4)-1;
155 lp_rast_triangle_32_4(task, arg2);
156 }
157
/**
 * Non-SSE fallback: rasterize a 3-plane (32-bit) triangle over a 4x4
 * block by forwarding to the 16x16 variant.
 */
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_32_3_16(task, arg);
}
164
165 #else
166 #include <emmintrin.h>
167 #include "util/u_sse.h"
168
169
/**
 * SSE variant of build_masks() for 32-bit edge-function values.
 *
 * Computes the 16 stepped edge values for a 4x4 block in four __m128i
 * rows, then derives two 16-bit masks:
 *   *outmask  |= sign bits of the raw values
 *   *partmask |= sign bits of the values biased by cdiff
 *
 * The two saturating packs (epi32 -> epi16 -> epi8) preserve each
 * value's sign down to one byte, so _mm_movemask_epi8 yields one bit
 * per pixel.
 */
static INLINE void
build_masks_32(int c,
               int cdiff,
               int dcdx,
               int dcdy,
               unsigned *outmask,
               unsigned *partmask)
{
   /* Top row: c stepped along x. */
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      /* Bias every value by cdiff, then extract sign bits again for the
       * partial-coverage mask.
       */
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
214
215
216 static INLINE unsigned
217 build_mask_linear_32(int c, int dcdx, int dcdy)
218 {
219 __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
220 __m128i xdcdy = _mm_set1_epi32(dcdy);
221
222 /* Get values across the quad
223 */
224 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
225 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
226 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
227
228 /* pack pairs of results into epi16
229 */
230 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
231 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
232
233 /* pack into epi8, preserving sign bits
234 */
235 __m128i result = _mm_packs_epi16(cstep01, cstep23);
236
237 /* extract sign bits to create mask
238 */
239 return _mm_movemask_epi8(result);
240 }
241
242 static INLINE unsigned
243 sign_bits4(const __m128i *cstep, int cdiff)
244 {
245
246 /* Adjust the step values
247 */
248 __m128i cio4 = _mm_set1_epi32(cdiff);
249 __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
250 __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
251 __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
252 __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);
253
254 /* Pack down to epi8
255 */
256 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
257 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
258 __m128i result = _mm_packs_epi16(cstep01, cstep23);
259
260 /* Extract the sign bits
261 */
262 return _mm_movemask_epi8(result);
263 }
264
265
266 #define NR_PLANES 3
267
268
269
270
271
272
273
/**
 * SSE rasterizer for a 3-plane triangle over a 16x16 block.
 *
 * Walks the 16x16 block as a 4x4 grid of 4x4 sub-blocks.  Sub-blocks
 * that fail the trivial-reject test are skipped; the rest get a
 * per-pixel coverage mask computed with SSE sign-bit extraction, and
 * partially-covered sub-blocks are queued in out[] and shaded at the
 * end.
 *
 * NOTE(review): in this command, arg.triangle.plane_mask does NOT carry
 * a plane mask -- it appears to pack the sub-tile position (x in the
 * low byte, y in the next).  Confirm against the setup code that emits
 * this command.
 */
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                         const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   /* Deferred list of partially-covered 4x4 sub-blocks (at most 16). */
   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   /* Gather the three planes' c/dcdx/dcdy/eo fields into per-field
    * vectors (one lane per plane).
    */
   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   /* Evaluate each plane's edge function at this block's origin. */
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   /* Scale eo by 4 to get the trivial-reject offset for a 4x4 block. */
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   /* Per-plane x-step spans: {0, dcdx, 2dcdx, 3dcdx}. */
   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            /* Sub-block not trivially rejected: compute per-pixel
             * coverage.  c<p>_<r> holds plane p's values for row r of
             * the 4x4 sub-block; OR-ing the planes combines their sign
             * bits (a pixel is outside if ANY plane is negative).
             */
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            /* One sign bit per pixel: set means "outside". */
            unsigned mask = _mm_movemask_epi8(c_0123);

            /* Write unconditionally; nr only advances when at least one
             * pixel is covered, so fully-outside blocks are dropped.
             */
            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   /* Shade the queued sub-blocks; the mask is inverted because the
    * shader expects set bits for covered pixels.
    */
   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
383
384
385
386
387
/**
 * SSE rasterizer for a 3-plane triangle over a single 4x4 block.
 *
 * Same coverage computation as the partially-covered path of
 * lp_rast_triangle_32_3_16(), but for one block and with no
 * trivial-reject test (no eo/rej4 needed).
 *
 * NOTE(review): as in the 16x16 variant, arg.triangle.plane_mask here
 * appears to pack the block position (x low byte, y next byte) rather
 * than a plane mask -- confirm against the setup code.
 */
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;   /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;   /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;   /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   /* Gather the planes' c/dcdx/dcdy fields into per-field vectors
    * (one lane per plane); the eo lane is not needed here.
    */
   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   /* Evaluate each plane's edge function at this block's origin. */
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   /* Per-plane x-step spans: {0, dcdx, 2dcdx, 3dcdx}. */
   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      /* c<p>_<r> holds plane p's values for row r; OR-ing the planes
       * combines their sign bits (outside if ANY plane is negative).
       */
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      /* One sign bit per pixel: set means "outside". */
      unsigned mask = _mm_movemask_epi8(c_0123);

      /* Invert before shading: the shader expects set bits for covered
       * pixels.  Skip entirely-outside blocks.
       */
      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}
472
473 #undef NR_PLANES
474 #endif
475
476
477 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
478 #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
479
480 #define TAG(x) x##_1
481 #define NR_PLANES 1
482 #include "lp_rast_tri_tmp.h"
483
484 #define TAG(x) x##_2
485 #define NR_PLANES 2
486 #include "lp_rast_tri_tmp.h"
487
488 #define TAG(x) x##_3
489 #define NR_PLANES 3
490 /*#define TRI_4 lp_rast_triangle_3_4*/
491 /*#define TRI_16 lp_rast_triangle_3_16*/
492 #include "lp_rast_tri_tmp.h"
493
494 #define TAG(x) x##_4
495 #define NR_PLANES 4
496 /*#define TRI_16 lp_rast_triangle_4_16*/
497 #include "lp_rast_tri_tmp.h"
498
499 #define TAG(x) x##_5
500 #define NR_PLANES 5
501 #include "lp_rast_tri_tmp.h"
502
503 #define TAG(x) x##_6
504 #define NR_PLANES 6
505 #include "lp_rast_tri_tmp.h"
506
507 #define TAG(x) x##_7
508 #define NR_PLANES 7
509 #include "lp_rast_tri_tmp.h"
510
511 #define TAG(x) x##_8
512 #define NR_PLANES 8
513 #include "lp_rast_tri_tmp.h"
514
515 #ifdef PIPE_ARCH_SSE
516 #undef BUILD_MASKS
517 #undef BUILD_MASK_LINEAR
518 #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
519 #define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
520 #endif
521
522 #define TAG(x) x##_32_1
523 #define NR_PLANES 1
524 #include "lp_rast_tri_tmp.h"
525
526 #define TAG(x) x##_32_2
527 #define NR_PLANES 2
528 #include "lp_rast_tri_tmp.h"
529
530 #define TAG(x) x##_32_3
531 #define NR_PLANES 3
532 /*#define TRI_4 lp_rast_triangle_3_4*/
533 /*#define TRI_16 lp_rast_triangle_3_16*/
534 #include "lp_rast_tri_tmp.h"
535
536 #define TAG(x) x##_32_4
537 #define NR_PLANES 4
538 #ifdef PIPE_ARCH_SSE
539 #define TRI_16 lp_rast_triangle_32_4_16
540 #endif
541 #include "lp_rast_tri_tmp.h"
542
543 #define TAG(x) x##_32_5
544 #define NR_PLANES 5
545 #include "lp_rast_tri_tmp.h"
546
547 #define TAG(x) x##_32_6
548 #define NR_PLANES 6
549 #include "lp_rast_tri_tmp.h"
550
551 #define TAG(x) x##_32_7
552 #define NR_PLANES 7
553 #include "lp_rast_tri_tmp.h"
554
555 #define TAG(x) x##_32_8
556 #define NR_PLANES 8
557 #include "lp_rast_tri_tmp.h"
558