a8d8379297ef5d7511b3cb7e8d3f546be7fcdb7a
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa_avx.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34
35 #if (KNOB_SIMD_WIDTH == 8)
36
37 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
38 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
39 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
40 #if ENABLE_AVX512_SIMD16
41 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
42 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
43 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
44 #endif
45 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
46
47 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
48 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
49 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
50
51 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
52 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
53 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
54
55 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
56 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
57 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
58
59 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
60 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
61
62 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
63 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
64 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]);
65
66 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
67 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
68 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]);
69
70 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
71 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
72
73 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
74 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
75 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
76 #if ENABLE_AVX512_SIMD16
77 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
78 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
79 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
80 #endif
81 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
82
83 template <uint32_t TotalControlPoints>
84 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
85 {
86 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
87 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
88 // Each attribute has 4 components.
89
90 /// @todo Optimize this
91
92 float* pOutVec = (float*)verts;
93
94 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
95 {
96 uint32_t input_cp = primIndex * TotalControlPoints + cp;
97 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
98 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
99
100 // Loop over all components of the attribute
101 for (uint32_t i = 0; i < 4; ++i)
102 {
103 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
104 pOutVec[cp * 4 + i] = pInputVec[input_lane];
105 }
106 }
107 }
108
109 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
110 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
111 {
112 SetNextPaState(
113 pa,
114 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
115 PaPatchListSingle<TotalControlPoints>);
116
117 return false;
118 }
119
120 template<uint32_t TotalControlPoints>
121 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
122 {
123 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
124 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
125 // Each attribute has 4 components.
126
127 /// @todo Optimize this
128
129 // Loop over all components of the attribute
130 for (uint32_t i = 0; i < 4; ++i)
131 {
132 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
133 {
134 float vec[KNOB_SIMD_WIDTH];
135 for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
136 {
137 uint32_t input_cp = lane * TotalControlPoints + cp;
138 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
139 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
140
141 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
142 vec[lane] = pInputVec[input_lane];
143 }
144 verts[cp][i] = _simd_loadu_ps(vec);
145 }
146 }
147
148 SetNextPaState(
149 pa,
150 PaPatchList<TotalControlPoints>,
151 PaPatchListSingle<TotalControlPoints>,
152 0,
153 KNOB_SIMD_WIDTH,
154 true);
155
156 return true;
157 }
158
159 #define PA_PATCH_LIST_TERMINATOR(N) \
160 template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
161 { return PaPatchListTerm<N>(pa, slot, verts); }
162 PA_PATCH_LIST_TERMINATOR(1)
163 PA_PATCH_LIST_TERMINATOR(2)
164 PA_PATCH_LIST_TERMINATOR(3)
165 PA_PATCH_LIST_TERMINATOR(4)
166 PA_PATCH_LIST_TERMINATOR(5)
167 PA_PATCH_LIST_TERMINATOR(6)
168 PA_PATCH_LIST_TERMINATOR(7)
169 PA_PATCH_LIST_TERMINATOR(8)
170 PA_PATCH_LIST_TERMINATOR(9)
171 PA_PATCH_LIST_TERMINATOR(10)
172 PA_PATCH_LIST_TERMINATOR(11)
173 PA_PATCH_LIST_TERMINATOR(12)
174 PA_PATCH_LIST_TERMINATOR(13)
175 PA_PATCH_LIST_TERMINATOR(14)
176 PA_PATCH_LIST_TERMINATOR(15)
177 PA_PATCH_LIST_TERMINATOR(16)
178 PA_PATCH_LIST_TERMINATOR(17)
179 PA_PATCH_LIST_TERMINATOR(18)
180 PA_PATCH_LIST_TERMINATOR(19)
181 PA_PATCH_LIST_TERMINATOR(20)
182 PA_PATCH_LIST_TERMINATOR(21)
183 PA_PATCH_LIST_TERMINATOR(22)
184 PA_PATCH_LIST_TERMINATOR(23)
185 PA_PATCH_LIST_TERMINATOR(24)
186 PA_PATCH_LIST_TERMINATOR(25)
187 PA_PATCH_LIST_TERMINATOR(26)
188 PA_PATCH_LIST_TERMINATOR(27)
189 PA_PATCH_LIST_TERMINATOR(28)
190 PA_PATCH_LIST_TERMINATOR(29)
191 PA_PATCH_LIST_TERMINATOR(30)
192 PA_PATCH_LIST_TERMINATOR(31)
193 PA_PATCH_LIST_TERMINATOR(32)
194 #undef PA_PATCH_LIST_TERMINATOR
195
196 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
197 {
198 SetNextPaState(pa, PaTriList1, PaTriListSingle0);
199 return false; // Not enough vertices to assemble 4 or 8 triangles.
200 }
201
202 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
203 {
204 SetNextPaState(pa, PaTriList2, PaTriListSingle0);
205 return false; // Not enough vertices to assemble 8 triangles.
206 }
207
208 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
209 {
210 #if KNOB_ARCH == KNOB_ARCH_AVX
211
212 simdvector& a = PaGetSimdVector(pa, 0, slot);
213 simdvector& b = PaGetSimdVector(pa, 1, slot);
214 simdvector& c = PaGetSimdVector(pa, 2, slot);
215 simdscalar s;
216
217 // Tri Pattern - provoking vertex is always v0
218 // v0 -> 0 3 6 9 12 15 18 21
219 // v1 -> 1 4 7 10 13 16 19 22
220 // v2 -> 2 5 8 11 14 17 20 23
221
222 for (int i = 0; i < 4; ++i)
223 {
224 simdvector& v0 = verts[0];
225 v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
226 v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
227 v0[i] = _mm256_permute_ps(v0[i], 0x6C);
228 s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
229 v0[i] = _simd_blend_ps(v0[i], s, 0x44);
230
231 simdvector& v1 = verts[1];
232 v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
233 v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
234 v1[i] = _mm256_permute_ps(v1[i], 0xB1);
235 s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
236 v1[i] = _simd_blend_ps(v1[i], s, 0x66);
237
238 simdvector& v2 = verts[2];
239 v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
240 v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
241 v2[i] = _mm256_permute_ps(v2[i], 0xC6);
242 s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
243 v2[i] = _simd_blend_ps(v2[i], s, 0x22);
244 }
245
246 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
247
248 const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
249 const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
250 const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
251
252 const simdvector &a = PaGetSimdVector(pa, 0, slot);
253 const simdvector &b = PaGetSimdVector(pa, 1, slot);
254 const simdvector &c = PaGetSimdVector(pa, 2, slot);
255
256 // v0 -> a0 a3 a6 b1 b4 b7 c2 c5
257 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6
258 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7
259
260 simdvector &v0 = verts[0];
261 simdvector &v1 = verts[1];
262 simdvector &v2 = verts[2];
263
264 // for simd x, y, z, and w
265 for (int i = 0; i < 4; ++i)
266 {
267 v0[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
268 v0[i] = _simd_permute_ps(v0[i], perm0);
269
270 v1[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
271 v1[i] = _simd_permute_ps(v1[i], perm1);
272
273 v2[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
274 v2[i] = _simd_permute_ps(v2[i], perm2);
275 }
276
277 #endif
278
279 SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true);
280 return true;
281 }
282
283 #if ENABLE_AVX512_SIMD16
284 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
285 {
286 SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriListSingle0);
287 return false; // Not enough vertices to assemble 16 triangles
288 }
289
290 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
291 {
292 SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriListSingle0);
293 return false; // Not enough vertices to assemble 16 triangles
294 }
295
296 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
297 {
298 const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0);
299 const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1);
300 const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2);
301
302 const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
303 const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
304 const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
305
306 simd16vector &v0 = verts[0];
307 simd16vector &v1 = verts[1];
308 simd16vector &v2 = verts[2];
309
310 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
311 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
312 // v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
313
314 // for simd16 x, y, z, and w
315 for (int i = 0; i < 4; i += 1)
316 {
317 v0[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
318 v0[i] = _simd16_permute_ps(v0[i], perm0);
319
320 v1[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
321 v1[i] = _simd16_permute_ps(v1[i], perm1);
322
323 v2[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
324 v2[i] = _simd16_permute_ps(v2[i], perm2);
325 }
326
327 SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true);
328 return true;
329 }
330
331 #endif
332 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
333 {
334 // We have 12 simdscalars contained within 3 simdvectors which
335 // hold at least 8 triangles worth of data. We want to assemble a single
336 // triangle with data in horizontal form.
337 #if USE_SIMD16_FRONTEND
338 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
339 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
340 const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
341
342 simdvector a;
343 simdvector b;
344 simdvector c;
345
346 for (uint32_t i = 0; i < 4; i += 1)
347 {
348 if (pa.useAlternateOffset)
349 {
350 a[i] = b_16[i].hi;
351 b[i] = c_16[i].lo;
352 c[i] = c_16[i].hi;
353 }
354 else
355 {
356 a[i] = a_16[i].lo;
357 b[i] = a_16[i].hi;
358 c[i] = b_16[i].lo;
359 }
360 }
361
362 #else
363 simdvector& a = PaGetSimdVector(pa, 0, slot);
364 simdvector& b = PaGetSimdVector(pa, 1, slot);
365 simdvector& c = PaGetSimdVector(pa, 2, slot);
366
367 #endif
368 // Convert from vertical to horizontal.
369 // Tri Pattern - provoking vertex is always v0
370 // v0 -> 0 3 6 9 12 15 18 21
371 // v1 -> 1 4 7 10 13 16 19 22
372 // v2 -> 2 5 8 11 14 17 20 23
373 switch(primIndex)
374 {
375 case 0:
376 verts[0] = swizzleLane0(a);
377 verts[1] = swizzleLane1(a);
378 verts[2] = swizzleLane2(a);
379 break;
380 case 1:
381 verts[0] = swizzleLane3(a);
382 verts[1] = swizzleLane4(a);
383 verts[2] = swizzleLane5(a);
384 break;
385 case 2:
386 verts[0] = swizzleLane6(a);
387 verts[1] = swizzleLane7(a);
388 verts[2] = swizzleLane0(b);
389 break;
390 case 3:
391 verts[0] = swizzleLane1(b);
392 verts[1] = swizzleLane2(b);
393 verts[2] = swizzleLane3(b);
394 break;
395 case 4:
396 verts[0] = swizzleLane4(b);
397 verts[1] = swizzleLane5(b);
398 verts[2] = swizzleLane6(b);
399 break;
400 case 5:
401 verts[0] = swizzleLane7(b);
402 verts[1] = swizzleLane0(c);
403 verts[2] = swizzleLane1(c);
404 break;
405 case 6:
406 verts[0] = swizzleLane2(c);
407 verts[1] = swizzleLane3(c);
408 verts[2] = swizzleLane4(c);
409 break;
410 case 7:
411 verts[0] = swizzleLane5(c);
412 verts[1] = swizzleLane6(c);
413 verts[2] = swizzleLane7(c);
414 break;
415 };
416 }
417
418 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
419 {
420 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
421 return false; // Not enough vertices to assemble 8 triangles.
422 }
423
424 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
425 {
426 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
427 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
428 simdscalar s;
429
430 for(int i = 0; i < 4; ++i)
431 {
432 simdscalar a0 = a[i];
433 simdscalar b0 = b[i];
434
435 // Tri Pattern - provoking vertex is always v0
436 // v0 -> 01234567
437 // v1 -> 13355779
438 // v2 -> 22446688
439 simdvector& v0 = verts[0];
440 v0[i] = a0;
441
442 // s -> 4567891011
443 s = _mm256_permute2f128_ps(a0, b0, 0x21);
444 // s -> 23456789
445 s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
446
447 simdvector& v1 = verts[1];
448 // v1 -> 13355779
449 v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
450
451 simdvector& v2 = verts[2];
452 // v2 -> 22446688
453 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
454 }
455
456 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH);
457 return true;
458 }
459
#if 0 // ENABLE_AVX512_SIMD16
// NOTE: this SIMD16 triangle strip state is still compiled out.  Its types
// and helper calls follow the pattern used by PaTriList2_simd16 (simd16vector
// output, PaGetSimdVector_simd16, SetNextPaState_simd16) so that it is
// self-consistent.  No matching PaTriStrip0_simd16 is visible in this part of
// the file, so enabling it presumably needs additional hookup - confirm
// before turning it on.

/// @brief State 1 for the triangle strip topology (SIMD16 variant):
///        assembles 16 triangles from the previous and current SIMD16
///        vectors of the given slot.
/// @param pa    - State for PA state machine.
/// @param slot  - Attribute slot to read.
/// @param verts - Output; verts[0..2] receive v0, v1, v2 of the 16 triangles.
/// @return Always true.
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);

    simd16vector &v0 = verts[0];
    simd16vector &v1 = verts[1];
    simd16vector &v2 = verts[2];

    // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
    // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
    // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0

    // for simd16 x, y, z, and w
    for (int i = 0; i < 4; i += 1)
    {
        // Rotate both inputs down by one 128-bit lane: (0 3 2 1) = 00 11 10 01
        simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39);                 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
        simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39);                 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3

        simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000);                    // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
        simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2));  // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1

        v0[i] = a[i];                                                                   // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
        v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1));               // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
        v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2));               // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
    }

    SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH);
    return true;
}

#endif
493 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
494 {
495 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
496 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
497
498 // Convert from vertical to horizontal.
499 // Tri Pattern - provoking vertex is always v0
500 // v0 -> 01234567
501 // v1 -> 13355779
502 // v2 -> 22446688
503 switch(primIndex)
504 {
505 case 0:
506 verts[0] = swizzleLane0(a);
507 verts[1] = swizzleLane1(a);
508 verts[2] = swizzleLane2(a);
509 break;
510 case 1:
511 verts[0] = swizzleLane1(a);
512 verts[1] = swizzleLane3(a);
513 verts[2] = swizzleLane2(a);
514 break;
515 case 2:
516 verts[0] = swizzleLane2(a);
517 verts[1] = swizzleLane3(a);
518 verts[2] = swizzleLane4(a);
519 break;
520 case 3:
521 verts[0] = swizzleLane3(a);
522 verts[1] = swizzleLane5(a);
523 verts[2] = swizzleLane4(a);
524 break;
525 case 4:
526 verts[0] = swizzleLane4(a);
527 verts[1] = swizzleLane5(a);
528 verts[2] = swizzleLane6(a);
529 break;
530 case 5:
531 verts[0] = swizzleLane5(a);
532 verts[1] = swizzleLane7(a);
533 verts[2] = swizzleLane6(a);
534 break;
535 case 6:
536 verts[0] = swizzleLane6(a);
537 verts[1] = swizzleLane7(a);
538 verts[2] = swizzleLane0(b);
539 break;
540 case 7:
541 verts[0] = swizzleLane7(a);
542 verts[1] = swizzleLane1(b);
543 verts[2] = swizzleLane0(b);
544 break;
545 };
546 }
547
548 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
549 {
550 simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
551
552 // Extract vertex 0 to every lane of first vector
553 for(int i = 0; i < 4; ++i)
554 {
555 __m256 a0 = a[i];
556 simdvector& v0 = verts[0];
557 v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0));
558 v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00);
559 }
560
561 // store off leading vertex for attributes
562 PA_STATE_OPT::SIMDVERTEX* pVertex = (PA_STATE_OPT::SIMDVERTEX*)pa.pStreamBase;
563 pa.leadingVertex = pVertex[pa.cur];
564
565 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
566 return false; // Not enough vertices to assemble 8 triangles.
567 }
568
569 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
570 {
571 PA_STATE_OPT::SIMDVECTOR& leadVert = pa.leadingVertex.attrib[slot];
572 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
573 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
574 simdscalar s;
575
576 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
577 for(int i = 0; i < 4; ++i)
578 {
579 simdscalar a0 = a[i];
580 simdscalar b0 = b[i];
581
582 #if USE_SIMD16_FRONTEND
583 __m256 comp = leadVert[i].lo;
584 #else
585 __m256 comp = leadVert[i];
586 #endif
587 simdvector& v0 = verts[0];
588 v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
589 v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
590
591 simdvector& v2 = verts[2];
592 s = _mm256_permute2f128_ps(a0, b0, 0x21);
593 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
594
595 simdvector& v1 = verts[1];
596 v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
597 }
598
599 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
600 return true;
601 }
602
603 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
604 {
605 // vert 0 from leading vertex
606 #if USE_SIMD16_FRONTEND
607 PA_STATE_OPT::SIMDVECTOR& temp = pa.leadingVertex.attrib[slot];
608
609 simdvector lead;
610 lead[0] = temp[0].lo;
611 lead[1] = temp[1].lo;
612 lead[2] = temp[2].lo;
613 lead[3] = temp[3].lo;
614 verts[0] = swizzleLane0(lead);
615 #else
616 PA_STATE_OPT::SIMDVECTOR& lead = pa.leadingVertex.attrib[slot];
617 verts[0] = swizzleLane0(lead);
618 #endif
619
620 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
621 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
622
623 // vert 1
624 if (primIndex < 7)
625 {
626 verts[1] = swizzleLaneN(a, primIndex + 1);
627 }
628 else
629 {
630 verts[1] = swizzleLane0(b);
631 }
632
633 // vert 2
634 if (primIndex < 6)
635 {
636 verts[2] = swizzleLaneN(a, primIndex + 2);
637 }
638 else
639 {
640 verts[2] = swizzleLaneN(b, primIndex - 6);
641 }
642 }
643
644 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
645 {
646 SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
647 return false; // Not enough vertices to assemble 8 triangles.
648 }
649
650 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
651 {
652 simdvector& a = PaGetSimdVector(pa, 0, slot);
653 simdvector& b = PaGetSimdVector(pa, 1, slot);
654 simdscalar s1, s2;
655
656 for(int i = 0; i < 4; ++i)
657 {
658 simdscalar a0 = a[i];
659 simdscalar b0 = b[i];
660
661 s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
662 s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
663
664 simdvector& v0 = verts[0];
665 v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
666
667 simdvector& v1 = verts[1];
668 v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
669
670 simdvector& v2 = verts[2];
671 v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
672 }
673
674 SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
675 return true;
676 }
677
678 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
679 {
680 simdvector& a = PaGetSimdVector(pa, 0, slot);
681 simdvector& b = PaGetSimdVector(pa, 1, slot);
682
683 switch (primIndex)
684 {
685 case 0:
686 // triangle 0 - 0 1 2
687 verts[0] = swizzleLane0(a);
688 verts[1] = swizzleLane1(a);
689 verts[2] = swizzleLane2(a);
690 break;
691
692 case 1:
693 // triangle 1 - 0 2 3
694 verts[0] = swizzleLane0(a);
695 verts[1] = swizzleLane2(a);
696 verts[2] = swizzleLane3(a);
697 break;
698
699 case 2:
700 // triangle 2 - 4 5 6
701 verts[0] = swizzleLane4(a);
702 verts[1] = swizzleLane5(a);
703 verts[2] = swizzleLane6(a);
704 break;
705
706 case 3:
707 // triangle 3 - 4 6 7
708 verts[0] = swizzleLane4(a);
709 verts[1] = swizzleLane6(a);
710 verts[2] = swizzleLane7(a);
711 break;
712
713 case 4:
714 // triangle 4 - 8 9 10 (0 1 2)
715 verts[0] = swizzleLane0(b);
716 verts[1] = swizzleLane1(b);
717 verts[2] = swizzleLane2(b);
718 break;
719
720 case 5:
721 // triangle 1 - 0 2 3
722 verts[0] = swizzleLane0(b);
723 verts[1] = swizzleLane2(b);
724 verts[2] = swizzleLane3(b);
725 break;
726
727 case 6:
728 // triangle 2 - 4 5 6
729 verts[0] = swizzleLane4(b);
730 verts[1] = swizzleLane5(b);
731 verts[2] = swizzleLane6(b);
732 break;
733
734 case 7:
735 // triangle 3 - 4 6 7
736 verts[0] = swizzleLane4(b);
737 verts[1] = swizzleLane6(b);
738 verts[2] = swizzleLane7(b);
739 break;
740 }
741 }
742
743 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
744 {
745 PaLineStripSingle0(pa, slot, lineIndex, verts);
746
747 if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) {
748 simdvector &start = PaGetSimdVector(pa, pa.first, slot);
749 verts[1] = swizzleLane0(start);
750 }
751 }
752
753 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
754 {
755 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
756 return false;
757 }
758
759 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
760 {
761 PaLineStrip1(pa, slot, verts);
762
763 if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) {
764 // loop reconnect now
765 int lane = pa.numPrims - pa.numPrimsComplete - 1;
766 simdvector &start = PaGetSimdVector(pa, pa.first, slot);
767 for (int i = 0; i < 4; i++) {
768 float *startVtx = (float *)&(start[i]);
769 float *targetVtx = (float *)&(verts[1][i]);
770 targetVtx[lane] = startVtx[0];
771 }
772 }
773
774 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH);
775 return true;
776 }
777
778
779 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
780 {
781 SetNextPaState(pa, PaLineList1, PaLineListSingle0);
782 return false; // Not enough vertices to assemble 8 lines
783 }
784
785 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
786 {
787 simdvector& a = PaGetSimdVector(pa, 0, slot);
788 simdvector& b = PaGetSimdVector(pa, 1, slot);
789 /// @todo: verify provoking vertex is correct
790 // Line list 0 1 2 3 4 5 6 7
791 // 8 9 10 11 12 13 14 15
792
793 // shuffle:
794 // 0 2 4 6 8 10 12 14
795 // 1 3 5 7 9 11 13 15
796
797 for (uint32_t i = 0; i < 4; ++i)
798 {
799 // 0 1 2 3 8 9 10 11
800 __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
801 // 4 5 6 7 12 13 14 15
802 __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
803
804 // 0 2 4 6 8 10 12 14
805 verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
806 // 1 3 5 7 9 11 13 15
807 verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
808 }
809
810 SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true);
811 return true;
812 }
813
814 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
815 {
816 simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
817 simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
818
819 switch (primIndex)
820 {
821 case 0:
822 verts[0] = swizzleLane0(a);
823 verts[1] = swizzleLane1(a);
824 break;
825 case 1:
826 verts[0] = swizzleLane2(a);
827 verts[1] = swizzleLane3(a);
828 break;
829 case 2:
830 verts[0] = swizzleLane4(a);
831 verts[1] = swizzleLane5(a);
832 break;
833 case 3:
834 verts[0] = swizzleLane6(a);
835 verts[1] = swizzleLane7(a);
836 break;
837 case 4:
838 verts[0] = swizzleLane0(b);
839 verts[1] = swizzleLane1(b);
840 break;
841 case 5:
842 verts[0] = swizzleLane2(b);
843 verts[1] = swizzleLane3(b);
844 break;
845 case 6:
846 verts[0] = swizzleLane4(b);
847 verts[1] = swizzleLane5(b);
848 break;
849 case 7:
850 verts[0] = swizzleLane6(b);
851 verts[1] = swizzleLane7(b);
852 break;
853 }
854 }
855
856 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
857 {
858 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
859 return false; // Not enough vertices to assemble 8 lines
860 }
861
862 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
863 {
864 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
865 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
866
867 /// @todo: verify provoking vertex is correct
868 // Line list 0 1 2 3 4 5 6 7
869 // 8 9 10 11 12 13 14 15
870
871 // shuffle:
872 // 0 1 2 3 4 5 6 7
873 // 1 2 3 4 5 6 7 8
874
875 verts[0] = a;
876
877 for(uint32_t i = 0; i < 4; ++i)
878 {
879 // 1 2 3 x 5 6 7 x
880 __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
881 // 4 5 6 7 8 9 10 11
882 __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
883
884 // x x x 4 x x x 8
885 __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
886
887 verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
888 }
889
890 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
891 return true;
892 }
893
894 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
895 {
896 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
897 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
898
899 switch (lineIndex)
900 {
901 case 0:
902 verts[0] = swizzleLane0(a);
903 verts[1] = swizzleLane1(a);
904 break;
905 case 1:
906 verts[0] = swizzleLane1(a);
907 verts[1] = swizzleLane2(a);
908 break;
909 case 2:
910 verts[0] = swizzleLane2(a);
911 verts[1] = swizzleLane3(a);
912 break;
913 case 3:
914 verts[0] = swizzleLane3(a);
915 verts[1] = swizzleLane4(a);
916 break;
917 case 4:
918 verts[0] = swizzleLane4(a);
919 verts[1] = swizzleLane5(a);
920 break;
921 case 5:
922 verts[0] = swizzleLane5(a);
923 verts[1] = swizzleLane6(a);
924 break;
925 case 6:
926 verts[0] = swizzleLane6(a);
927 verts[1] = swizzleLane7(a);
928 break;
929 case 7:
930 verts[0] = swizzleLane7(a);
931 verts[1] = swizzleLane0(b);
932 break;
933 }
934 }
935
936 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
937 {
938 simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
939
940 verts[0] = a; // points only have 1 vertex.
941
942 SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
943 return true;
944 }
945
946 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
947 {
948 simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
949 switch(primIndex)
950 {
951 case 0:
952 verts[0] = swizzleLane0(a);
953 break;
954 case 1:
955 verts[0] = swizzleLane1(a);
956 break;
957 case 2:
958 verts[0] = swizzleLane2(a);
959 break;
960 case 3:
961 verts[0] = swizzleLane3(a);
962 break;
963 case 4:
964 verts[0] = swizzleLane4(a);
965 break;
966 case 5:
967 verts[0] = swizzleLane5(a);
968 break;
969 case 6:
970 verts[0] = swizzleLane6(a);
971 break;
972 case 7:
973 verts[0] = swizzleLane7(a);
974 break;
975 }
976 }
977
978 //////////////////////////////////////////////////////////////////////////
979 /// @brief State 1 for RECT_LIST topology.
980 /// There is not enough to assemble 8 triangles.
981 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
982 {
983 SetNextPaState(pa, PaRectList1, PaRectListSingle0);
984 return false;
985 }
986
987 //////////////////////////////////////////////////////////////////////////
988 /// @brief State 1 for RECT_LIST topology.
/// Rect lists have the following format.
990 /// w x y z
991 /// v2 o---o v5 o---o v8 o---o v11 o---o
992 /// | \ | | \ | | \ | | \ |
993 /// v1 o---o v4 o---o v7 o---o v10 o---o
994 /// v0 v3 v6 v9
995 ///
996 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
997 ///
998 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
999 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
1000 /// etc.
1001 ///
1002 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
1003 /// where v0 contains all the first vertices for 8 triangles.
1004 ///
1005 /// Result:
1006 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
1007 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
1008 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
1009 ///
1010 /// @param pa - State for PA state machine.
1011 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1012 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1013 bool PaRectList1(
1014 PA_STATE_OPT& pa,
1015 uint32_t slot,
1016 simdvector verts[])
1017 {
1018 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
1019 simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
1020 simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
1021
1022 __m256 tmp0, tmp1, tmp2;
1023
1024 // Loop over each component in the simdvector.
1025 for(int i = 0; i < 4; ++i)
1026 {
1027 simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
1028 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
1029 v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
1030 tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
1031 v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
1032 v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
1033
1034 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
1035 /// AVX2 should make this much cheaper.
1036 simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
1037 v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
1038 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
1039 tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
1040 tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
1041 v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
1042 v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
1043 v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
1044
1045 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
1046 simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
1047 v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
1048 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
1049 v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
1050
1051 // Need to compute 4th implied vertex for the rectangle.
1052 tmp2 = _mm256_sub_ps(v0[i], v1[i]);
1053 tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
1054 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
1055 v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
1056 }
1057
1058 SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
1059 return true;
1060 }
1061
1062 //////////////////////////////////////////////////////////////////////////
1063 /// @brief State 2 for RECT_LIST topology.
1064 /// Not implemented unless there is a use case for more then 8 rects.
1065 /// @param pa - State for PA state machine.
1066 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1067 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1068 bool PaRectList2(
1069 PA_STATE_OPT& pa,
1070 uint32_t slot,
1071 simdvector verts[])
1072 {
1073 SWR_ASSERT(0); // Is rect list used for anything other then clears?
1074 SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
1075 return true;
1076 }
1077
1078 #if ENABLE_AVX512_SIMD16
1079 //////////////////////////////////////////////////////////////////////////
1080 /// @brief State 1 for RECT_LIST topology.
1081 /// There is not enough to assemble 8 triangles.
1082 bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1083 {
1084 SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0);
1085 return false;
1086 }
1087
1088 //////////////////////////////////////////////////////////////////////////
1089 /// @brief State 1 for RECT_LIST topology.
1090 /// Rect lists has the following format.
1091 /// w x y z
1092 /// v2 o---o v5 o---o v8 o---o v11 o---o
1093 /// | \ | | \ | | \ | | \ |
1094 /// v1 o---o v4 o---o v7 o---o v10 o---o
1095 /// v0 v3 v6 v9
1096 ///
1097 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
1098 ///
1099 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
1100 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
1101 /// etc.
1102 ///
1103 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
1104 /// where v0 contains all the first vertices for 8 triangles.
1105 ///
1106 /// Result:
1107 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
1108 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
1109 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
1110 ///
1111 /// @param pa - State for PA state machine.
1112 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1113 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1114 bool PaRectList1_simd16(
1115 PA_STATE_OPT& pa,
1116 uint32_t slot,
1117 simd16vector verts[])
1118 {
1119 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }
1120 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
1121
1122 simdvector a;
1123 simdvector b;
1124
1125 for (uint32_t i = 0; i < 4; i += 1)
1126 {
1127 if (pa.useAlternateOffset)
1128 {
1129 a[i] = b_16[i].lo;
1130 b[i] = b_16[i].hi;
1131 }
1132 else
1133 {
1134 a[i] = a_16[i].lo;
1135 b[i] = a_16[i].hi;
1136 }
1137 }
1138
1139 __m256 tmp0, tmp1, tmp2;
1140
1141 // Loop over each component in the simdvector.
1142 for (int i = 0; i < 4; i += 1)
1143 {
1144 simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
1145 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
1146 v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
1147 tmp1 = _mm256_permute_ps(v0[i].lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
1148 v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
1149 v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
1150
1151 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
1152 /// AVX2 should make this much cheaper.
1153 simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
1154 v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
1155 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
1156 tmp2 = _mm256_blend_ps(v1[i].lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
1157 tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
1158 v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
1159 v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
1160 v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
1161
1162 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
1163 simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
1164 v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
1165 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
1166 v2[i].lo = _mm256_blend_ps(tmp1, v2[i].lo, 0xF0);
1167
1168 // Need to compute 4th implied vertex for the rectangle.
1169 tmp2 = _mm256_sub_ps(v0[i].lo, v1[i].lo);
1170 tmp2 = _mm256_add_ps(tmp2, v2[i].lo); // tmp2 = { w, *, x, *, y, *, z, * }
1171 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
1172 v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
1173
1174 v0[i].hi = _simd_setzero_ps();
1175 v1[i].hi = _simd_setzero_ps();
1176 v2[i].hi = _simd_setzero_ps();
1177 }
1178
1179 SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
1180 return true;
1181 }
1182
1183 //////////////////////////////////////////////////////////////////////////
1184 /// @brief State 2 for RECT_LIST topology.
1185 /// Not implemented unless there is a use case for more then 8 rects.
1186 /// @param pa - State for PA state machine.
1187 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1188 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1189 bool PaRectList2_simd16(
1190 PA_STATE_OPT& pa,
1191 uint32_t slot,
1192 simd16vector verts[])
1193 {
1194 SWR_ASSERT(0); // Is rect list used for anything other then clears?
1195 SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
1196 return true;
1197 }
1198
1199 #endif
1200 //////////////////////////////////////////////////////////////////////////
1201 /// @brief This procedure is called by the Binner to assemble the attributes.
1202 /// Unlike position, which is stored vertically, the attributes are
1203 /// stored horizontally. The outputs from the VS, labeled as 'a' and
1204 /// 'b' are vertical. This function needs to transpose the lanes
1205 /// containing the vertical attribute data into horizontal form.
1206 /// @param pa - State for PA state machine.
1207 /// @param slot - Index into VS output for a given attribute.
1208 /// @param primIndex - Binner processes each triangle individually.
1209 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1210 void PaRectListSingle0(
1211 PA_STATE_OPT& pa,
1212 uint32_t slot,
1213 uint32_t primIndex,
1214 __m128 verts[])
1215 {
1216 // We have 12 simdscalars contained within 3 simdvectors which
1217 // hold at least 8 triangles worth of data. We want to assemble a single
1218 // triangle with data in horizontal form.
1219 #if USE_SIMD16_FRONTEND
1220 const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1221 const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1222
1223 simdvector a;
1224 simdvector b;
1225
1226 for (uint32_t i = 0; i < 4; i += 1)
1227 {
1228 if (pa.useAlternateOffset)
1229 {
1230 a[i] = b_16[i].lo;
1231 b[i] = b_16[i].hi;
1232 }
1233 else
1234 {
1235 a[i] = a_16[i].lo;
1236 b[i] = a_16[i].hi;
1237 }
1238 }
1239
1240 #else
1241 simdvector& a = PaGetSimdVector(pa, 0, slot);
1242
1243 #endif
1244 // Convert from vertical to horizontal.
1245 switch(primIndex)
1246 {
1247 case 0:
1248 verts[0] = swizzleLane0(a);
1249 verts[1] = swizzleLane1(a);
1250 verts[2] = swizzleLane2(a);
1251 break;
1252 case 1:
1253 verts[0] = swizzleLane0(a);
1254 verts[1] = swizzleLane2(a);
1255 verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2);
1256 break;
1257 case 2:
1258 case 3:
1259 case 4:
1260 case 5:
1261 case 6:
1262 case 7:
1263 SWR_ASSERT(0);
1264 break;
1265 };
1266 }
1267
1268 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
1269 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
1270 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
1271 {
1272 const API_STATE& state = GetApiState(pDC);
1273
1274 this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
1275
1276 #if ENABLE_AVX512_SIMD16
1277 pfnPaFunc_simd16 = nullptr;
1278
1279 #endif
1280 switch (this->binTopology)
1281 {
1282 case TOP_TRIANGLE_LIST:
1283 this->pfnPaFunc = PaTriList0;
1284 #if ENABLE_AVX512_SIMD16
1285 this->pfnPaFunc_simd16 = PaTriList0_simd16;
1286 #endif
1287 break;
1288 case TOP_TRIANGLE_STRIP:
1289 this->pfnPaFunc = PaTriStrip0;
1290 break;
1291 case TOP_TRIANGLE_FAN:
1292 this->pfnPaFunc = PaTriFan0;
1293 break;
1294 case TOP_QUAD_LIST:
1295 this->pfnPaFunc = PaQuadList0;
1296 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
1297 break;
1298 case TOP_QUAD_STRIP:
1299 // quad strip pattern when decomposed into triangles is the same as verts strips
1300 this->pfnPaFunc = PaTriStrip0;
1301 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
1302 break;
1303 case TOP_LINE_LIST:
1304 this->pfnPaFunc = PaLineList0;
1305 this->numPrims = in_numPrims;
1306 break;
1307 case TOP_LINE_STRIP:
1308 this->pfnPaFunc = PaLineStrip0;
1309 this->numPrims = in_numPrims;
1310 break;
1311 case TOP_LINE_LOOP:
1312 this->pfnPaFunc = PaLineLoop0;
1313 this->numPrims = in_numPrims;
1314 break;
1315 case TOP_POINT_LIST:
1316 // use point binner and rasterizer if supported
1317 this->pfnPaFunc = PaPoints0;
1318 this->numPrims = in_numPrims;
1319 break;
1320 case TOP_RECT_LIST:
1321 this->pfnPaFunc = PaRectList0;
1322 #if ENABLE_AVX512_SIMD16
1323 this->pfnPaFunc_simd16 = PaRectList0_simd16;
1324 #endif
1325 this->numPrims = in_numPrims * 2;
1326 break;
1327
1328 case TOP_PATCHLIST_1:
1329 this->pfnPaFunc = PaPatchList<1>;
1330 break;
1331 case TOP_PATCHLIST_2:
1332 this->pfnPaFunc = PaPatchList<2>;
1333 break;
1334 case TOP_PATCHLIST_3:
1335 this->pfnPaFunc = PaPatchList<3>;
1336 break;
1337 case TOP_PATCHLIST_4:
1338 this->pfnPaFunc = PaPatchList<4>;
1339 break;
1340 case TOP_PATCHLIST_5:
1341 this->pfnPaFunc = PaPatchList<5>;
1342 break;
1343 case TOP_PATCHLIST_6:
1344 this->pfnPaFunc = PaPatchList<6>;
1345 break;
1346 case TOP_PATCHLIST_7:
1347 this->pfnPaFunc = PaPatchList<7>;
1348 break;
1349 case TOP_PATCHLIST_8:
1350 this->pfnPaFunc = PaPatchList<8>;
1351 break;
1352 case TOP_PATCHLIST_9:
1353 this->pfnPaFunc = PaPatchList<9>;
1354 break;
1355 case TOP_PATCHLIST_10:
1356 this->pfnPaFunc = PaPatchList<10>;
1357 break;
1358 case TOP_PATCHLIST_11:
1359 this->pfnPaFunc = PaPatchList<11>;
1360 break;
1361 case TOP_PATCHLIST_12:
1362 this->pfnPaFunc = PaPatchList<12>;
1363 break;
1364 case TOP_PATCHLIST_13:
1365 this->pfnPaFunc = PaPatchList<13>;
1366 break;
1367 case TOP_PATCHLIST_14:
1368 this->pfnPaFunc = PaPatchList<14>;
1369 break;
1370 case TOP_PATCHLIST_15:
1371 this->pfnPaFunc = PaPatchList<15>;
1372 break;
1373 case TOP_PATCHLIST_16:
1374 this->pfnPaFunc = PaPatchList<16>;
1375 break;
1376 case TOP_PATCHLIST_17:
1377 this->pfnPaFunc = PaPatchList<17>;
1378 break;
1379 case TOP_PATCHLIST_18:
1380 this->pfnPaFunc = PaPatchList<18>;
1381 break;
1382 case TOP_PATCHLIST_19:
1383 this->pfnPaFunc = PaPatchList<19>;
1384 break;
1385 case TOP_PATCHLIST_20:
1386 this->pfnPaFunc = PaPatchList<20>;
1387 break;
1388 case TOP_PATCHLIST_21:
1389 this->pfnPaFunc = PaPatchList<21>;
1390 break;
1391 case TOP_PATCHLIST_22:
1392 this->pfnPaFunc = PaPatchList<22>;
1393 break;
1394 case TOP_PATCHLIST_23:
1395 this->pfnPaFunc = PaPatchList<23>;
1396 break;
1397 case TOP_PATCHLIST_24:
1398 this->pfnPaFunc = PaPatchList<24>;
1399 break;
1400 case TOP_PATCHLIST_25:
1401 this->pfnPaFunc = PaPatchList<25>;
1402 break;
1403 case TOP_PATCHLIST_26:
1404 this->pfnPaFunc = PaPatchList<26>;
1405 break;
1406 case TOP_PATCHLIST_27:
1407 this->pfnPaFunc = PaPatchList<27>;
1408 break;
1409 case TOP_PATCHLIST_28:
1410 this->pfnPaFunc = PaPatchList<28>;
1411 break;
1412 case TOP_PATCHLIST_29:
1413 this->pfnPaFunc = PaPatchList<29>;
1414 break;
1415 case TOP_PATCHLIST_30:
1416 this->pfnPaFunc = PaPatchList<30>;
1417 break;
1418 case TOP_PATCHLIST_31:
1419 this->pfnPaFunc = PaPatchList<31>;
1420 break;
1421 case TOP_PATCHLIST_32:
1422 this->pfnPaFunc = PaPatchList<32>;
1423 break;
1424
1425 default:
1426 SWR_ASSERT(0);
1427 break;
1428 };
1429
1430 this->pfnPaFuncReset = this->pfnPaFunc;
1431 #if ENABLE_AVX512_SIMD16
1432 this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
1433 #endif
1434
1435 #if USE_SIMD16_FRONTEND
1436 simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1437 simd16scalari id82 = _simd16_set_epi32( 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
1438
1439 #else
1440 simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1441 simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
1442
1443 #endif
1444 switch(this->binTopology)
1445 {
1446 case TOP_TRIANGLE_LIST:
1447 case TOP_TRIANGLE_STRIP:
1448 case TOP_TRIANGLE_FAN:
1449 case TOP_LINE_STRIP:
1450 case TOP_LINE_LIST:
1451 case TOP_LINE_LOOP:
1452 #if USE_SIMD16_FRONTEND
1453 this->primIDIncr = 16;
1454 this->primID = id16;
1455 #else
1456 this->primIDIncr = 8;
1457 this->primID = id8;
1458 #endif
1459 break;
1460 case TOP_QUAD_LIST:
1461 case TOP_QUAD_STRIP:
1462 case TOP_RECT_LIST:
1463 #if USE_SIMD16_FRONTEND
1464 this->primIDIncr = 8;
1465 this->primID = id82;
1466 #else
1467 this->primIDIncr = 4;
1468 this->primID = id4;
1469 #endif
1470 break;
1471 case TOP_POINT_LIST:
1472 #if USE_SIMD16_FRONTEND
1473 this->primIDIncr = 16;
1474 this->primID = id16;
1475 #else
1476 this->primIDIncr = 8;
1477 this->primID = id8;
1478 #endif
1479 break;
1480 case TOP_PATCHLIST_1:
1481 case TOP_PATCHLIST_2:
1482 case TOP_PATCHLIST_3:
1483 case TOP_PATCHLIST_4:
1484 case TOP_PATCHLIST_5:
1485 case TOP_PATCHLIST_6:
1486 case TOP_PATCHLIST_7:
1487 case TOP_PATCHLIST_8:
1488 case TOP_PATCHLIST_9:
1489 case TOP_PATCHLIST_10:
1490 case TOP_PATCHLIST_11:
1491 case TOP_PATCHLIST_12:
1492 case TOP_PATCHLIST_13:
1493 case TOP_PATCHLIST_14:
1494 case TOP_PATCHLIST_15:
1495 case TOP_PATCHLIST_16:
1496 case TOP_PATCHLIST_17:
1497 case TOP_PATCHLIST_18:
1498 case TOP_PATCHLIST_19:
1499 case TOP_PATCHLIST_20:
1500 case TOP_PATCHLIST_21:
1501 case TOP_PATCHLIST_22:
1502 case TOP_PATCHLIST_23:
1503 case TOP_PATCHLIST_24:
1504 case TOP_PATCHLIST_25:
1505 case TOP_PATCHLIST_26:
1506 case TOP_PATCHLIST_27:
1507 case TOP_PATCHLIST_28:
1508 case TOP_PATCHLIST_29:
1509 case TOP_PATCHLIST_30:
1510 case TOP_PATCHLIST_31:
1511 case TOP_PATCHLIST_32:
1512 // Always run KNOB_SIMD_WIDTH number of patches at a time.
1513 #if USE_SIMD16_FRONTEND
1514 this->primIDIncr = 16;
1515 this->primID = id16;
1516 #else
1517 this->primIDIncr = 8;
1518 this->primID = id8;
1519 #endif
1520 break;
1521
1522 default:
1523 SWR_ASSERT(0);
1524 break;
1525 };
1526
1527 }
1528 #endif