swr: [rasterizer core] implement InnerConservative input coverage
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa_avx.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34
35 #if (KNOB_SIMD_WIDTH == 8)
36
37 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
38 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
39 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
40 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
41
42 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
43 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
44 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
45
46 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
47 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
48 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
49
50 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
51 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
52 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
53
54 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
55 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
56
57 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
58 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
59 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]);
60
61 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
62 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
63 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]);
64
65 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
66 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
67
68 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
69 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
70 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
71 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
72
73 template <uint32_t TotalControlPoints>
74 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
75 {
76 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
77 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
78 // Each attribute has 4 components.
79
80 /// @todo Optimize this
81
82 float* pOutVec = (float*)verts;
83
84 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
85 {
86 uint32_t input_cp = primIndex * TotalControlPoints + cp;
87 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
88 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
89
90 // Loop over all components of the attribute
91 for (uint32_t i = 0; i < 4; ++i)
92 {
93 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
94 pOutVec[cp * 4 + i] = pInputVec[input_lane];
95 }
96 }
97 }
98
99 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
100 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
101 {
102 SetNextPaState(
103 pa,
104 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
105 PaPatchListSingle<TotalControlPoints>);
106
107 return false;
108 }
109
110 template<uint32_t TotalControlPoints>
111 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
112 {
113 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
114 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
115 // Each attribute has 4 components.
116
117 /// @todo Optimize this
118
119 // Loop over all components of the attribute
120 for (uint32_t i = 0; i < 4; ++i)
121 {
122 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
123 {
124 float vec[KNOB_SIMD_WIDTH];
125 for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
126 {
127 uint32_t input_cp = lane * TotalControlPoints + cp;
128 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
129 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
130
131 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
132 vec[lane] = pInputVec[input_lane];
133 }
134 verts[cp][i] = _simd_loadu_ps(vec);
135 }
136 }
137
138 SetNextPaState(
139 pa,
140 PaPatchList<TotalControlPoints>,
141 PaPatchListSingle<TotalControlPoints>,
142 0,
143 KNOB_SIMD_WIDTH,
144 true);
145
146 return true;
147 }
148
149 #define PA_PATCH_LIST_TERMINATOR(N) \
150 template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
151 { return PaPatchListTerm<N>(pa, slot, verts); }
152 PA_PATCH_LIST_TERMINATOR(1)
153 PA_PATCH_LIST_TERMINATOR(2)
154 PA_PATCH_LIST_TERMINATOR(3)
155 PA_PATCH_LIST_TERMINATOR(4)
156 PA_PATCH_LIST_TERMINATOR(5)
157 PA_PATCH_LIST_TERMINATOR(6)
158 PA_PATCH_LIST_TERMINATOR(7)
159 PA_PATCH_LIST_TERMINATOR(8)
160 PA_PATCH_LIST_TERMINATOR(9)
161 PA_PATCH_LIST_TERMINATOR(10)
162 PA_PATCH_LIST_TERMINATOR(11)
163 PA_PATCH_LIST_TERMINATOR(12)
164 PA_PATCH_LIST_TERMINATOR(13)
165 PA_PATCH_LIST_TERMINATOR(14)
166 PA_PATCH_LIST_TERMINATOR(15)
167 PA_PATCH_LIST_TERMINATOR(16)
168 PA_PATCH_LIST_TERMINATOR(17)
169 PA_PATCH_LIST_TERMINATOR(18)
170 PA_PATCH_LIST_TERMINATOR(19)
171 PA_PATCH_LIST_TERMINATOR(20)
172 PA_PATCH_LIST_TERMINATOR(21)
173 PA_PATCH_LIST_TERMINATOR(22)
174 PA_PATCH_LIST_TERMINATOR(23)
175 PA_PATCH_LIST_TERMINATOR(24)
176 PA_PATCH_LIST_TERMINATOR(25)
177 PA_PATCH_LIST_TERMINATOR(26)
178 PA_PATCH_LIST_TERMINATOR(27)
179 PA_PATCH_LIST_TERMINATOR(28)
180 PA_PATCH_LIST_TERMINATOR(29)
181 PA_PATCH_LIST_TERMINATOR(30)
182 PA_PATCH_LIST_TERMINATOR(31)
183 PA_PATCH_LIST_TERMINATOR(32)
184 #undef PA_PATCH_LIST_TERMINATOR
185
186 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
187 {
188 SetNextPaState(pa, PaTriList1, PaTriListSingle0);
189 return false; // Not enough vertices to assemble 4 or 8 triangles.
190 }
191
192 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
193 {
194 SetNextPaState(pa, PaTriList2, PaTriListSingle0);
195 return false; // Not enough vertices to assemble 8 triangles.
196 }
197
198 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
199 {
200 simdvector& a = PaGetSimdVector(pa, 0, slot);
201 simdvector& b = PaGetSimdVector(pa, 1, slot);
202 simdvector& c = PaGetSimdVector(pa, 2, slot);
203 simdscalar s;
204
205 // Tri Pattern - provoking vertex is always v0
206 // v0 -> 0 3 6 9 12 15 18 21
207 // v1 -> 1 4 7 10 13 16 19 22
208 // v2 -> 2 5 8 11 14 17 20 23
209
210 for(int i = 0; i < 4; ++i)
211 {
212 simdvector& v0 = verts[0];
213 v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
214 v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
215 v0[i] = _mm256_permute_ps(v0[i], 0x6C);
216 s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
217 v0[i] = _simd_blend_ps(v0[i], s, 0x44);
218
219 simdvector& v1 = verts[1];
220 v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
221 v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
222 v1[i] = _mm256_permute_ps(v1[i], 0xB1);
223 s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
224 v1[i] = _simd_blend_ps(v1[i], s, 0x66);
225
226 simdvector& v2 = verts[2];
227 v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
228 v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
229 v2[i] = _mm256_permute_ps(v2[i], 0xC6);
230 s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
231 v2[i] = _simd_blend_ps(v2[i], s, 0x22);
232 }
233
234 SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true);
235 return true;
236 }
237
238 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
239 {
240 // We have 12 simdscalars contained within 3 simdvectors which
241 // hold at least 8 triangles worth of data. We want to assemble a single
242 // triangle with data in horizontal form.
243 simdvector& a = PaGetSimdVector(pa, 0, slot);
244 simdvector& b = PaGetSimdVector(pa, 1, slot);
245 simdvector& c = PaGetSimdVector(pa, 2, slot);
246
247 // Convert from vertical to horizontal.
248 // Tri Pattern - provoking vertex is always v0
249 // v0 -> 0 3 6 9 12 15 18 21
250 // v1 -> 1 4 7 10 13 16 19 22
251 // v2 -> 2 5 8 11 14 17 20 23
252 switch(primIndex)
253 {
254 case 0:
255 verts[0] = swizzleLane0(a);
256 verts[1] = swizzleLane1(a);
257 verts[2] = swizzleLane2(a);
258 break;
259 case 1:
260 verts[0] = swizzleLane3(a);
261 verts[1] = swizzleLane4(a);
262 verts[2] = swizzleLane5(a);
263 break;
264 case 2:
265 verts[0] = swizzleLane6(a);
266 verts[1] = swizzleLane7(a);
267 verts[2] = swizzleLane0(b);
268 break;
269 case 3:
270 verts[0] = swizzleLane1(b);
271 verts[1] = swizzleLane2(b);
272 verts[2] = swizzleLane3(b);
273 break;
274 case 4:
275 verts[0] = swizzleLane4(b);
276 verts[1] = swizzleLane5(b);
277 verts[2] = swizzleLane6(b);
278 break;
279 case 5:
280 verts[0] = swizzleLane7(b);
281 verts[1] = swizzleLane0(c);
282 verts[2] = swizzleLane1(c);
283 break;
284 case 6:
285 verts[0] = swizzleLane2(c);
286 verts[1] = swizzleLane3(c);
287 verts[2] = swizzleLane4(c);
288 break;
289 case 7:
290 verts[0] = swizzleLane5(c);
291 verts[1] = swizzleLane6(c);
292 verts[2] = swizzleLane7(c);
293 break;
294 };
295 }
296
297 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
298 {
299 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
300 return false; // Not enough vertices to assemble 8 triangles.
301 }
302
303 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
304 {
305 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
306 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
307 simdscalar s;
308
309 for(int i = 0; i < 4; ++i)
310 {
311 simdscalar a0 = a[i];
312 simdscalar b0 = b[i];
313
314 // Tri Pattern - provoking vertex is always v0
315 // v0 -> 01234567
316 // v1 -> 13355779
317 // v2 -> 22446688
318 simdvector& v0 = verts[0];
319 v0[i] = a0;
320
321 // s -> 4567891011
322 s = _mm256_permute2f128_ps(a0, b0, 0x21);
323 // s -> 23456789
324 s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
325
326 simdvector& v1 = verts[1];
327 // v1 -> 13355779
328 v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
329
330 simdvector& v2 = verts[2];
331 // v2 -> 22446688
332 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
333 }
334
335 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH);
336 return true;
337 }
338
339 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
340 {
341 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
342 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
343
344 // Convert from vertical to horizontal.
345 // Tri Pattern - provoking vertex is always v0
346 // v0 -> 01234567
347 // v1 -> 13355779
348 // v2 -> 22446688
349 switch(primIndex)
350 {
351 case 0:
352 verts[0] = swizzleLane0(a);
353 verts[1] = swizzleLane1(a);
354 verts[2] = swizzleLane2(a);
355 break;
356 case 1:
357 verts[0] = swizzleLane1(a);
358 verts[1] = swizzleLane3(a);
359 verts[2] = swizzleLane2(a);
360 break;
361 case 2:
362 verts[0] = swizzleLane2(a);
363 verts[1] = swizzleLane3(a);
364 verts[2] = swizzleLane4(a);
365 break;
366 case 3:
367 verts[0] = swizzleLane3(a);
368 verts[1] = swizzleLane5(a);
369 verts[2] = swizzleLane4(a);
370 break;
371 case 4:
372 verts[0] = swizzleLane4(a);
373 verts[1] = swizzleLane5(a);
374 verts[2] = swizzleLane6(a);
375 break;
376 case 5:
377 verts[0] = swizzleLane5(a);
378 verts[1] = swizzleLane7(a);
379 verts[2] = swizzleLane6(a);
380 break;
381 case 6:
382 verts[0] = swizzleLane6(a);
383 verts[1] = swizzleLane7(a);
384 verts[2] = swizzleLane0(b);
385 break;
386 case 7:
387 verts[0] = swizzleLane7(a);
388 verts[1] = swizzleLane1(b);
389 verts[2] = swizzleLane0(b);
390 break;
391 };
392 }
393
394 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
395 {
396 simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
397
398 // Extract vertex 0 to every lane of first vector
399 for(int i = 0; i < 4; ++i)
400 {
401 __m256 a0 = a[i];
402 simdvector& v0 = verts[0];
403 v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0));
404 v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00);
405 }
406
407 // store off leading vertex for attributes
408 simdvertex* pVertex = (simdvertex*)pa.pStreamBase;
409 pa.leadingVertex = pVertex[pa.cur];
410
411 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
412 return false; // Not enough vertices to assemble 8 triangles.
413 }
414
415 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
416 {
417 simdvector& leadVert = pa.leadingVertex.attrib[slot];
418 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
419 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
420 simdscalar s;
421
422 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
423 for(int i = 0; i < 4; ++i)
424 {
425 simdscalar a0 = a[i];
426 simdscalar b0 = b[i];
427
428 __m256 comp = leadVert[i];
429 simdvector& v0 = verts[0];
430 v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
431 v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
432
433 simdvector& v2 = verts[2];
434 s = _mm256_permute2f128_ps(a0, b0, 0x21);
435 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
436
437 simdvector& v1 = verts[1];
438 v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
439 }
440
441 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
442 return true;
443 }
444
445 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
446 {
447 // vert 0 from leading vertex
448 simdvector& lead = pa.leadingVertex.attrib[slot];
449 verts[0] = swizzleLane0(lead);
450
451 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
452 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
453
454 // vert 1
455 if (primIndex < 7)
456 {
457 verts[1] = swizzleLaneN(a, primIndex + 1);
458 }
459 else
460 {
461 verts[1] = swizzleLane0(b);
462 }
463
464 // vert 2
465 if (primIndex < 6)
466 {
467 verts[2] = swizzleLaneN(a, primIndex + 2);
468 }
469 else
470 {
471 verts[2] = swizzleLaneN(b, primIndex - 6);
472 }
473 }
474
475 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
476 {
477 SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
478 return false; // Not enough vertices to assemble 8 triangles.
479 }
480
481 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
482 {
483 simdvector& a = PaGetSimdVector(pa, 0, slot);
484 simdvector& b = PaGetSimdVector(pa, 1, slot);
485 simdscalar s1, s2;
486
487 for(int i = 0; i < 4; ++i)
488 {
489 simdscalar a0 = a[i];
490 simdscalar b0 = b[i];
491
492 s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
493 s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
494
495 simdvector& v0 = verts[0];
496 v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
497
498 simdvector& v1 = verts[1];
499 v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
500
501 simdvector& v2 = verts[2];
502 v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
503 }
504
505 SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
506 return true;
507 }
508
509 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
510 {
511 simdvector& a = PaGetSimdVector(pa, 0, slot);
512 simdvector& b = PaGetSimdVector(pa, 1, slot);
513
514 switch (primIndex)
515 {
516 case 0:
517 // triangle 0 - 0 1 2
518 verts[0] = swizzleLane0(a);
519 verts[1] = swizzleLane1(a);
520 verts[2] = swizzleLane2(a);
521 break;
522
523 case 1:
524 // triangle 1 - 0 2 3
525 verts[0] = swizzleLane0(a);
526 verts[1] = swizzleLane2(a);
527 verts[2] = swizzleLane3(a);
528 break;
529
530 case 2:
531 // triangle 2 - 4 5 6
532 verts[0] = swizzleLane4(a);
533 verts[1] = swizzleLane5(a);
534 verts[2] = swizzleLane6(a);
535 break;
536
537 case 3:
538 // triangle 3 - 4 6 7
539 verts[0] = swizzleLane4(a);
540 verts[1] = swizzleLane6(a);
541 verts[2] = swizzleLane7(a);
542 break;
543
544 case 4:
545 // triangle 4 - 8 9 10 (0 1 2)
546 verts[0] = swizzleLane0(b);
547 verts[1] = swizzleLane1(b);
548 verts[2] = swizzleLane2(b);
549 break;
550
551 case 5:
552 // triangle 1 - 0 2 3
553 verts[0] = swizzleLane0(b);
554 verts[1] = swizzleLane2(b);
555 verts[2] = swizzleLane3(b);
556 break;
557
558 case 6:
559 // triangle 2 - 4 5 6
560 verts[0] = swizzleLane4(b);
561 verts[1] = swizzleLane5(b);
562 verts[2] = swizzleLane6(b);
563 break;
564
565 case 7:
566 // triangle 3 - 4 6 7
567 verts[0] = swizzleLane4(b);
568 verts[1] = swizzleLane6(b);
569 verts[2] = swizzleLane7(b);
570 break;
571 }
572 }
573
574 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
575 {
576 PaLineStripSingle0(pa, slot, lineIndex, verts);
577
578 if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) {
579 simdvector &start = PaGetSimdVector(pa, pa.first, slot);
580 verts[1] = swizzleLane0(start);
581 }
582 }
583
584 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
585 {
586 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
587 return false;
588 }
589
590 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
591 {
592 PaLineStrip1(pa, slot, verts);
593
594 if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) {
595 // loop reconnect now
596 int lane = pa.numPrims - pa.numPrimsComplete - 1;
597 simdvector &start = PaGetSimdVector(pa, pa.first, slot);
598 for (int i = 0; i < 4; i++) {
599 float *startVtx = (float *)&(start[i]);
600 float *targetVtx = (float *)&(verts[1][i]);
601 targetVtx[lane] = startVtx[0];
602 }
603 }
604
605 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH);
606 return true;
607 }
608
609
610 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
611 {
612 SetNextPaState(pa, PaLineList1, PaLineListSingle0);
613 return false; // Not enough vertices to assemble 8 lines
614 }
615
616 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
617 {
618 simdvector& a = PaGetSimdVector(pa, 0, slot);
619 simdvector& b = PaGetSimdVector(pa, 1, slot);
620 /// @todo: verify provoking vertex is correct
621 // Line list 0 1 2 3 4 5 6 7
622 // 8 9 10 11 12 13 14 15
623
624 // shuffle:
625 // 0 2 4 6 8 10 12 14
626 // 1 3 5 7 9 11 13 15
627
628 for (uint32_t i = 0; i < 4; ++i)
629 {
630 // 0 1 2 3 8 9 10 11
631 __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
632 // 4 5 6 7 12 13 14 15
633 __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
634
635 // 0 2 4 6 8 10 12 14
636 verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
637 // 1 3 5 7 9 11 13 15
638 verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
639 }
640
641 SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true);
642 return true;
643 }
644
645 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
646 {
647 simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
648 simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
649
650 switch (primIndex)
651 {
652 case 0:
653 verts[0] = swizzleLane0(a);
654 verts[1] = swizzleLane1(a);
655 break;
656 case 1:
657 verts[0] = swizzleLane2(a);
658 verts[1] = swizzleLane3(a);
659 break;
660 case 2:
661 verts[0] = swizzleLane4(a);
662 verts[1] = swizzleLane5(a);
663 break;
664 case 3:
665 verts[0] = swizzleLane6(a);
666 verts[1] = swizzleLane7(a);
667 break;
668 case 4:
669 verts[0] = swizzleLane0(b);
670 verts[1] = swizzleLane1(b);
671 break;
672 case 5:
673 verts[0] = swizzleLane2(b);
674 verts[1] = swizzleLane3(b);
675 break;
676 case 6:
677 verts[0] = swizzleLane4(b);
678 verts[1] = swizzleLane5(b);
679 break;
680 case 7:
681 verts[0] = swizzleLane6(b);
682 verts[1] = swizzleLane7(b);
683 break;
684 }
685 }
686
687 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
688 {
689 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
690 return false; // Not enough vertices to assemble 8 lines
691 }
692
693 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
694 {
695 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
696 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
697
698 /// @todo: verify provoking vertex is correct
699 // Line list 0 1 2 3 4 5 6 7
700 // 8 9 10 11 12 13 14 15
701
702 // shuffle:
703 // 0 1 2 3 4 5 6 7
704 // 1 2 3 4 5 6 7 8
705
706 verts[0] = a;
707
708 for(uint32_t i = 0; i < 4; ++i)
709 {
710 // 1 2 3 x 5 6 7 x
711 __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
712 // 4 5 6 7 8 9 10 11
713 __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
714
715 // x x x 4 x x x 8
716 __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
717
718 verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
719 }
720
721 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
722 return true;
723 }
724
725 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
726 {
727 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
728 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
729
730 switch (lineIndex)
731 {
732 case 0:
733 verts[0] = swizzleLane0(a);
734 verts[1] = swizzleLane1(a);
735 break;
736 case 1:
737 verts[0] = swizzleLane1(a);
738 verts[1] = swizzleLane2(a);
739 break;
740 case 2:
741 verts[0] = swizzleLane2(a);
742 verts[1] = swizzleLane3(a);
743 break;
744 case 3:
745 verts[0] = swizzleLane3(a);
746 verts[1] = swizzleLane4(a);
747 break;
748 case 4:
749 verts[0] = swizzleLane4(a);
750 verts[1] = swizzleLane5(a);
751 break;
752 case 5:
753 verts[0] = swizzleLane5(a);
754 verts[1] = swizzleLane6(a);
755 break;
756 case 6:
757 verts[0] = swizzleLane6(a);
758 verts[1] = swizzleLane7(a);
759 break;
760 case 7:
761 verts[0] = swizzleLane7(a);
762 verts[1] = swizzleLane0(b);
763 break;
764 }
765 }
766
767 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
768 {
769 simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
770
771 verts[0] = a; // points only have 1 vertex.
772
773 SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
774 return true;
775 }
776
777 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
778 {
779 simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
780 switch(primIndex)
781 {
782 case 0:
783 verts[0] = swizzleLane0(a);
784 break;
785 case 1:
786 verts[0] = swizzleLane1(a);
787 break;
788 case 2:
789 verts[0] = swizzleLane2(a);
790 break;
791 case 3:
792 verts[0] = swizzleLane3(a);
793 break;
794 case 4:
795 verts[0] = swizzleLane4(a);
796 break;
797 case 5:
798 verts[0] = swizzleLane5(a);
799 break;
800 case 6:
801 verts[0] = swizzleLane6(a);
802 break;
803 case 7:
804 verts[0] = swizzleLane7(a);
805 break;
806 }
807 }
808
809 //////////////////////////////////////////////////////////////////////////
810 /// @brief State 1 for RECT_LIST topology.
811 /// There is not enough to assemble 8 triangles.
812 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
813 {
814 SetNextPaState(pa, PaRectList1, PaRectListSingle0);
815 return false;
816 }
817
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief State 1 for RECT_LIST topology.
820 /// Rect lists has the following format.
821 /// w x y z
822 /// v2 o---o v5 o---o v8 o---o v11 o---o
823 /// | \ | | \ | | \ | | \ |
824 /// v1 o---o v4 o---o v7 o---o v10 o---o
825 /// v0 v3 v6 v9
826 ///
827 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
828 ///
829 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
830 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
831 /// etc.
832 ///
833 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
834 /// where v0 contains all the first vertices for 8 triangles.
835 ///
836 /// Result:
837 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
838 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
839 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
840 ///
841 /// @param pa - State for PA state machine.
842 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
843 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
844 bool PaRectList1(
845 PA_STATE_OPT& pa,
846 uint32_t slot,
847 simdvector verts[])
848 {
849 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
850 simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
851 simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
852
853 __m256 tmp0, tmp1, tmp2;
854
855 // Loop over each component in the simdvector.
856 for(int i = 0; i < 4; ++i)
857 {
858 simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
859 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
860 v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
861 tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
862 v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
863 v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
864
865 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
866 /// AVX2 should make this much cheaper.
867 simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
868 v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
869 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
870 tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
871 tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
872 v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
873 v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
874 v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
875
876 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
877 simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
878 v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
879 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
880 v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
881
882 // Need to compute 4th implied vertex for the rectangle.
883 tmp2 = _mm256_sub_ps(v0[i], v1[i]);
884 tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
885 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
886 v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
887 }
888
889 SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
890 return true;
891 }
892
893 //////////////////////////////////////////////////////////////////////////
894 /// @brief State 2 for RECT_LIST topology.
895 /// Not implemented unless there is a use case for more then 8 rects.
896 /// @param pa - State for PA state machine.
897 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
898 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
899 bool PaRectList2(
900 PA_STATE_OPT& pa,
901 uint32_t slot,
902 simdvector verts[])
903 {
904 SWR_ASSERT(0); // Is rect list used for anything other then clears?
905 SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
906 return true;
907 }
908
909 //////////////////////////////////////////////////////////////////////////
910 /// @brief This procedure is called by the Binner to assemble the attributes.
911 /// Unlike position, which is stored vertically, the attributes are
912 /// stored horizontally. The outputs from the VS, labeled as 'a' and
913 /// 'b' are vertical. This function needs to transpose the lanes
914 /// containing the vertical attribute data into horizontal form.
915 /// @param pa - State for PA state machine.
916 /// @param slot - Index into VS output for a given attribute.
917 /// @param primIndex - Binner processes each triangle individually.
918 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
919 void PaRectListSingle0(
920 PA_STATE_OPT& pa,
921 uint32_t slot,
922 uint32_t primIndex,
923 __m128 verts[])
924 {
925 // We have 12 simdscalars contained within 3 simdvectors which
926 // hold at least 8 triangles worth of data. We want to assemble a single
927 // triangle with data in horizontal form.
928 simdvector& a = PaGetSimdVector(pa, 0, slot);
929
930 // Convert from vertical to horizontal.
931 switch(primIndex)
932 {
933 case 0:
934 verts[0] = swizzleLane0(a);
935 verts[1] = swizzleLane1(a);
936 verts[2] = swizzleLane2(a);
937 break;
938 case 1:
939 verts[0] = swizzleLane0(a);
940 verts[1] = swizzleLane2(a);
941 verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2);
942 break;
943 case 2:
944 case 3:
945 case 4:
946 case 5:
947 case 6:
948 case 7:
949 SWR_ASSERT(0);
950 break;
951 };
952 }
953
954 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
955 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
956 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
957 {
958 const API_STATE& state = GetApiState(pDC);
959
960 this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
961
962 switch (this->binTopology)
963 {
964 case TOP_TRIANGLE_LIST:
965 this->pfnPaFunc = PaTriList0;
966 break;
967 case TOP_TRIANGLE_STRIP:
968 this->pfnPaFunc = PaTriStrip0;
969 break;
970 case TOP_TRIANGLE_FAN:
971 this->pfnPaFunc = PaTriFan0;
972 break;
973 case TOP_QUAD_LIST:
974 this->pfnPaFunc = PaQuadList0;
975 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
976 break;
977 case TOP_QUAD_STRIP:
978 // quad strip pattern when decomposed into triangles is the same as verts strips
979 this->pfnPaFunc = PaTriStrip0;
980 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
981 break;
982 case TOP_LINE_LIST:
983 this->pfnPaFunc = PaLineList0;
984 this->numPrims = in_numPrims;
985 break;
986 case TOP_LINE_STRIP:
987 this->pfnPaFunc = PaLineStrip0;
988 this->numPrims = in_numPrims;
989 break;
990 case TOP_LINE_LOOP:
991 this->pfnPaFunc = PaLineLoop0;
992 this->numPrims = in_numPrims;
993 break;
994 case TOP_POINT_LIST:
995 // use point binner and rasterizer if supported
996 this->pfnPaFunc = PaPoints0;
997 this->numPrims = in_numPrims;
998 break;
999 case TOP_RECT_LIST:
1000 this->pfnPaFunc = PaRectList0;
1001 this->numPrims = in_numPrims * 2;
1002 break;
1003
1004 case TOP_PATCHLIST_1:
1005 this->pfnPaFunc = PaPatchList<1>;
1006 break;
1007 case TOP_PATCHLIST_2:
1008 this->pfnPaFunc = PaPatchList<2>;
1009 break;
1010 case TOP_PATCHLIST_3:
1011 this->pfnPaFunc = PaPatchList<3>;
1012 break;
1013 case TOP_PATCHLIST_4:
1014 this->pfnPaFunc = PaPatchList<4>;
1015 break;
1016 case TOP_PATCHLIST_5:
1017 this->pfnPaFunc = PaPatchList<5>;
1018 break;
1019 case TOP_PATCHLIST_6:
1020 this->pfnPaFunc = PaPatchList<6>;
1021 break;
1022 case TOP_PATCHLIST_7:
1023 this->pfnPaFunc = PaPatchList<7>;
1024 break;
1025 case TOP_PATCHLIST_8:
1026 this->pfnPaFunc = PaPatchList<8>;
1027 break;
1028 case TOP_PATCHLIST_9:
1029 this->pfnPaFunc = PaPatchList<9>;
1030 break;
1031 case TOP_PATCHLIST_10:
1032 this->pfnPaFunc = PaPatchList<10>;
1033 break;
1034 case TOP_PATCHLIST_11:
1035 this->pfnPaFunc = PaPatchList<11>;
1036 break;
1037 case TOP_PATCHLIST_12:
1038 this->pfnPaFunc = PaPatchList<12>;
1039 break;
1040 case TOP_PATCHLIST_13:
1041 this->pfnPaFunc = PaPatchList<13>;
1042 break;
1043 case TOP_PATCHLIST_14:
1044 this->pfnPaFunc = PaPatchList<14>;
1045 break;
1046 case TOP_PATCHLIST_15:
1047 this->pfnPaFunc = PaPatchList<15>;
1048 break;
1049 case TOP_PATCHLIST_16:
1050 this->pfnPaFunc = PaPatchList<16>;
1051 break;
1052 case TOP_PATCHLIST_17:
1053 this->pfnPaFunc = PaPatchList<17>;
1054 break;
1055 case TOP_PATCHLIST_18:
1056 this->pfnPaFunc = PaPatchList<18>;
1057 break;
1058 case TOP_PATCHLIST_19:
1059 this->pfnPaFunc = PaPatchList<19>;
1060 break;
1061 case TOP_PATCHLIST_20:
1062 this->pfnPaFunc = PaPatchList<20>;
1063 break;
1064 case TOP_PATCHLIST_21:
1065 this->pfnPaFunc = PaPatchList<21>;
1066 break;
1067 case TOP_PATCHLIST_22:
1068 this->pfnPaFunc = PaPatchList<22>;
1069 break;
1070 case TOP_PATCHLIST_23:
1071 this->pfnPaFunc = PaPatchList<23>;
1072 break;
1073 case TOP_PATCHLIST_24:
1074 this->pfnPaFunc = PaPatchList<24>;
1075 break;
1076 case TOP_PATCHLIST_25:
1077 this->pfnPaFunc = PaPatchList<25>;
1078 break;
1079 case TOP_PATCHLIST_26:
1080 this->pfnPaFunc = PaPatchList<26>;
1081 break;
1082 case TOP_PATCHLIST_27:
1083 this->pfnPaFunc = PaPatchList<27>;
1084 break;
1085 case TOP_PATCHLIST_28:
1086 this->pfnPaFunc = PaPatchList<28>;
1087 break;
1088 case TOP_PATCHLIST_29:
1089 this->pfnPaFunc = PaPatchList<29>;
1090 break;
1091 case TOP_PATCHLIST_30:
1092 this->pfnPaFunc = PaPatchList<30>;
1093 break;
1094 case TOP_PATCHLIST_31:
1095 this->pfnPaFunc = PaPatchList<31>;
1096 break;
1097 case TOP_PATCHLIST_32:
1098 this->pfnPaFunc = PaPatchList<32>;
1099 break;
1100
1101 default:
1102 SWR_ASSERT(0);
1103 break;
1104 };
1105
1106 this->pfnPaFuncReset = this->pfnPaFunc;
1107
1108 // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
1109 // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
1110 simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1111 simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
1112
1113 switch(this->binTopology)
1114 {
1115 case TOP_TRIANGLE_LIST:
1116 case TOP_TRIANGLE_STRIP:
1117 case TOP_TRIANGLE_FAN:
1118 case TOP_LINE_STRIP:
1119 case TOP_LINE_LIST:
1120 case TOP_LINE_LOOP:
1121 this->primIDIncr = 8;
1122 this->primID = id8;
1123 break;
1124 case TOP_QUAD_LIST:
1125 case TOP_QUAD_STRIP:
1126 case TOP_RECT_LIST:
1127 this->primIDIncr = 4;
1128 this->primID = id4;
1129 break;
1130 case TOP_POINT_LIST:
1131 this->primIDIncr = 8;
1132 this->primID = id8;
1133 break;
1134 case TOP_PATCHLIST_1:
1135 case TOP_PATCHLIST_2:
1136 case TOP_PATCHLIST_3:
1137 case TOP_PATCHLIST_4:
1138 case TOP_PATCHLIST_5:
1139 case TOP_PATCHLIST_6:
1140 case TOP_PATCHLIST_7:
1141 case TOP_PATCHLIST_8:
1142 case TOP_PATCHLIST_9:
1143 case TOP_PATCHLIST_10:
1144 case TOP_PATCHLIST_11:
1145 case TOP_PATCHLIST_12:
1146 case TOP_PATCHLIST_13:
1147 case TOP_PATCHLIST_14:
1148 case TOP_PATCHLIST_15:
1149 case TOP_PATCHLIST_16:
1150 case TOP_PATCHLIST_17:
1151 case TOP_PATCHLIST_18:
1152 case TOP_PATCHLIST_19:
1153 case TOP_PATCHLIST_20:
1154 case TOP_PATCHLIST_21:
1155 case TOP_PATCHLIST_22:
1156 case TOP_PATCHLIST_23:
1157 case TOP_PATCHLIST_24:
1158 case TOP_PATCHLIST_25:
1159 case TOP_PATCHLIST_26:
1160 case TOP_PATCHLIST_27:
1161 case TOP_PATCHLIST_28:
1162 case TOP_PATCHLIST_29:
1163 case TOP_PATCHLIST_30:
1164 case TOP_PATCHLIST_31:
1165 case TOP_PATCHLIST_32:
1166 // Always run KNOB_SIMD_WIDTH number of patches at a time.
1167 this->primIDIncr = 8;
1168 this->primID = id8;
1169 break;
1170
1171 default:
1172 SWR_ASSERT(0);
1173 break;
1174 };
1175
1176 }
1177 #endif