a8d8379297ef5d7511b3cb7e8d3f546be7fcdb7a
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
35 #if (KNOB_SIMD_WIDTH == 8)
37 bool PaTriList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
38 bool PaTriList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
39 bool PaTriList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
40 #if ENABLE_AVX512_SIMD16
41 bool PaTriList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
42 bool PaTriList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
43 bool PaTriList2_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
45 void PaTriListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
47 bool PaTriStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
48 bool PaTriStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
49 void PaTriStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
51 bool PaTriFan0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
52 bool PaTriFan1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
53 void PaTriFanSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
55 bool PaQuadList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
56 bool PaQuadList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
57 void PaQuadListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
59 bool PaLineLoop0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
60 bool PaLineLoop1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
62 bool PaLineList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
63 bool PaLineList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
64 void PaLineListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t index
, __m128 verts
[]);
66 bool PaLineStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
67 bool PaLineStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
68 void PaLineStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 lineverts
[]);
70 bool PaPoints0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
71 void PaPointsSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
73 bool PaRectList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
74 bool PaRectList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
75 bool PaRectList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
76 #if ENABLE_AVX512_SIMD16
77 bool PaRectList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
78 bool PaRectList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
79 bool PaRectList2_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
81 void PaRectListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
83 template <uint32_t TotalControlPoints
>
84 void PaPatchListSingle(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
86 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
87 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
88 // Each attribute has 4 components.
90 /// @todo Optimize this
92 float* pOutVec
= (float*)verts
;
94 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
96 uint32_t input_cp
= primIndex
* TotalControlPoints
+ cp
;
97 uint32_t input_vec
= input_cp
/ KNOB_SIMD_WIDTH
;
98 uint32_t input_lane
= input_cp
% KNOB_SIMD_WIDTH
;
100 // Loop over all components of the attribute
101 for (uint32_t i
= 0; i
< 4; ++i
)
103 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
104 pOutVec
[cp
* 4 + i
] = pInputVec
[input_lane
];
109 template<uint32_t TotalControlPoints
, uint32_t CurrentControlPoints
= 1>
110 static bool PaPatchList(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
114 PaPatchList
<TotalControlPoints
, CurrentControlPoints
+ 1>,
115 PaPatchListSingle
<TotalControlPoints
>);
120 template<uint32_t TotalControlPoints
>
121 static bool PaPatchListTerm(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
123 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
124 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
125 // Each attribute has 4 components.
127 /// @todo Optimize this
129 // Loop over all components of the attribute
130 for (uint32_t i
= 0; i
< 4; ++i
)
132 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
134 float vec
[KNOB_SIMD_WIDTH
];
135 for (uint32_t lane
= 0; lane
< KNOB_SIMD_WIDTH
; ++lane
)
137 uint32_t input_cp
= lane
* TotalControlPoints
+ cp
;
138 uint32_t input_vec
= input_cp
/ KNOB_SIMD_WIDTH
;
139 uint32_t input_lane
= input_cp
% KNOB_SIMD_WIDTH
;
141 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
142 vec
[lane
] = pInputVec
[input_lane
];
144 verts
[cp
][i
] = _simd_loadu_ps(vec
);
150 PaPatchList
<TotalControlPoints
>,
151 PaPatchListSingle
<TotalControlPoints
>,
159 #define PA_PATCH_LIST_TERMINATOR(N) \
160 template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
161 { return PaPatchListTerm<N>(pa, slot, verts); }
162 PA_PATCH_LIST_TERMINATOR(1)
163 PA_PATCH_LIST_TERMINATOR(2)
164 PA_PATCH_LIST_TERMINATOR(3)
165 PA_PATCH_LIST_TERMINATOR(4)
166 PA_PATCH_LIST_TERMINATOR(5)
167 PA_PATCH_LIST_TERMINATOR(6)
168 PA_PATCH_LIST_TERMINATOR(7)
169 PA_PATCH_LIST_TERMINATOR(8)
170 PA_PATCH_LIST_TERMINATOR(9)
171 PA_PATCH_LIST_TERMINATOR(10)
172 PA_PATCH_LIST_TERMINATOR(11)
173 PA_PATCH_LIST_TERMINATOR(12)
174 PA_PATCH_LIST_TERMINATOR(13)
175 PA_PATCH_LIST_TERMINATOR(14)
176 PA_PATCH_LIST_TERMINATOR(15)
177 PA_PATCH_LIST_TERMINATOR(16)
178 PA_PATCH_LIST_TERMINATOR(17)
179 PA_PATCH_LIST_TERMINATOR(18)
180 PA_PATCH_LIST_TERMINATOR(19)
181 PA_PATCH_LIST_TERMINATOR(20)
182 PA_PATCH_LIST_TERMINATOR(21)
183 PA_PATCH_LIST_TERMINATOR(22)
184 PA_PATCH_LIST_TERMINATOR(23)
185 PA_PATCH_LIST_TERMINATOR(24)
186 PA_PATCH_LIST_TERMINATOR(25)
187 PA_PATCH_LIST_TERMINATOR(26)
188 PA_PATCH_LIST_TERMINATOR(27)
189 PA_PATCH_LIST_TERMINATOR(28)
190 PA_PATCH_LIST_TERMINATOR(29)
191 PA_PATCH_LIST_TERMINATOR(30)
192 PA_PATCH_LIST_TERMINATOR(31)
193 PA_PATCH_LIST_TERMINATOR(32)
194 #undef PA_PATCH_LIST_TERMINATOR
196 bool PaTriList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
198 SetNextPaState(pa
, PaTriList1
, PaTriListSingle0
);
199 return false; // Not enough vertices to assemble 4 or 8 triangles.
202 bool PaTriList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
204 SetNextPaState(pa
, PaTriList2
, PaTriListSingle0
);
205 return false; // Not enough vertices to assemble 8 triangles.
208 bool PaTriList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
210 #if KNOB_ARCH == KNOB_ARCH_AVX
212 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
213 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
214 simdvector
& c
= PaGetSimdVector(pa
, 2, slot
);
217 // Tri Pattern - provoking vertex is always v0
218 // v0 -> 0 3 6 9 12 15 18 21
219 // v1 -> 1 4 7 10 13 16 19 22
220 // v2 -> 2 5 8 11 14 17 20 23
222 for (int i
= 0; i
< 4; ++i
)
224 simdvector
& v0
= verts
[0];
225 v0
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x92);
226 v0
[i
] = _simd_blend_ps(v0
[i
], c
[i
], 0x24);
227 v0
[i
] = _mm256_permute_ps(v0
[i
], 0x6C);
228 s
= _mm256_permute2f128_ps(v0
[i
], v0
[i
], 0x21);
229 v0
[i
] = _simd_blend_ps(v0
[i
], s
, 0x44);
231 simdvector
& v1
= verts
[1];
232 v1
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x24);
233 v1
[i
] = _simd_blend_ps(v1
[i
], c
[i
], 0x49);
234 v1
[i
] = _mm256_permute_ps(v1
[i
], 0xB1);
235 s
= _mm256_permute2f128_ps(v1
[i
], v1
[i
], 0x21);
236 v1
[i
] = _simd_blend_ps(v1
[i
], s
, 0x66);
238 simdvector
& v2
= verts
[2];
239 v2
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x49);
240 v2
[i
] = _simd_blend_ps(v2
[i
], c
[i
], 0x92);
241 v2
[i
] = _mm256_permute_ps(v2
[i
], 0xC6);
242 s
= _mm256_permute2f128_ps(v2
[i
], v2
[i
], 0x21);
243 v2
[i
] = _simd_blend_ps(v2
[i
], s
, 0x22);
246 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
248 const simdscalari perm0
= _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
249 const simdscalari perm1
= _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
250 const simdscalari perm2
= _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
252 const simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
253 const simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
254 const simdvector
&c
= PaGetSimdVector(pa
, 2, slot
);
256 // v0 -> a0 a3 a6 b1 b4 b7 c2 c5
257 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6
258 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7
260 simdvector
&v0
= verts
[0];
261 simdvector
&v1
= verts
[1];
262 simdvector
&v2
= verts
[2];
264 // for simd x, y, z, and w
265 for (int i
= 0; i
< 4; ++i
)
267 v0
[i
] = _simd_blend_ps(_simd_blend_ps(a
[i
], b
[i
], 0x92), c
[i
], 0x24);
268 v0
[i
] = _simd_permute_ps(v0
[i
], perm0
);
270 v1
[i
] = _simd_blend_ps(_simd_blend_ps(a
[i
], b
[i
], 0x24), c
[i
], 0x49);
271 v1
[i
] = _simd_permute_ps(v1
[i
], perm1
);
273 v2
[i
] = _simd_blend_ps(_simd_blend_ps(a
[i
], b
[i
], 0x49), c
[i
], 0x92);
274 v2
[i
] = _simd_permute_ps(v2
[i
], perm2
);
279 SetNextPaState(pa
, PaTriList0
, PaTriListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief State 0 for TRI_LIST topology (simd16). Only advances the state
///        machine to PaTriList1_simd16.
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriListSingle0);
    return false;   // Not enough vertices to assemble 16 triangles
}

//////////////////////////////////////////////////////////////////////////
/// @brief State 1 for TRI_LIST topology (simd16). Only advances the state
///        machine to PaTriList2_simd16.
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriListSingle0);
    return false;   // Not enough vertices to assemble 16 triangles
}

//////////////////////////////////////////////////////////////////////////
/// @brief State 2 for TRI_LIST topology (simd16). De-interleaves the three
///        input simd16vectors into per-triangle-vertex vectors.
/// @param verts - Output: verts[0], verts[1], verts[2] receive vertex 0, 1
///                and 2 of each triangle.
/// @return true (reconstructed; states 0 and 1 return false).
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0);
    const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1);
    const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2);

    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
    const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);

    simd16vector &v0 = verts[0];
    simd16vector &v1 = verts[1];
    simd16vector &v2 = verts[2];

    //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
    //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
    //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF

    // for simd16 x, y, z, and w
    for (int i = 0; i < 4; i += 1)
    {
        v0[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
        v0[i] = _simd16_permute_ps(v0[i], perm0);

        v1[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
        v1[i] = _simd16_permute_ps(v1[i], perm1);

        v2[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
        v2[i] = _simd16_permute_ps(v2[i], perm2);
    }

    SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true);
    return true;
}

#endif
332 void PaTriListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
334 // We have 12 simdscalars contained within 3 simdvectors which
335 // hold at least 8 triangles worth of data. We want to assemble a single
336 // triangle with data in horizontal form.
337 #if USE_SIMD16_FRONTEND
338 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
339 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
340 const simd16vector
&c_16
= PaGetSimdVector_simd16(pa
, 2, slot
);
346 for (uint32_t i
= 0; i
< 4; i
+= 1)
348 if (pa
.useAlternateOffset
)
363 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
364 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
365 simdvector
& c
= PaGetSimdVector(pa
, 2, slot
);
368 // Convert from vertical to horizontal.
369 // Tri Pattern - provoking vertex is always v0
370 // v0 -> 0 3 6 9 12 15 18 21
371 // v1 -> 1 4 7 10 13 16 19 22
372 // v2 -> 2 5 8 11 14 17 20 23
376 verts
[0] = swizzleLane0(a
);
377 verts
[1] = swizzleLane1(a
);
378 verts
[2] = swizzleLane2(a
);
381 verts
[0] = swizzleLane3(a
);
382 verts
[1] = swizzleLane4(a
);
383 verts
[2] = swizzleLane5(a
);
386 verts
[0] = swizzleLane6(a
);
387 verts
[1] = swizzleLane7(a
);
388 verts
[2] = swizzleLane0(b
);
391 verts
[0] = swizzleLane1(b
);
392 verts
[1] = swizzleLane2(b
);
393 verts
[2] = swizzleLane3(b
);
396 verts
[0] = swizzleLane4(b
);
397 verts
[1] = swizzleLane5(b
);
398 verts
[2] = swizzleLane6(b
);
401 verts
[0] = swizzleLane7(b
);
402 verts
[1] = swizzleLane0(c
);
403 verts
[2] = swizzleLane1(c
);
406 verts
[0] = swizzleLane2(c
);
407 verts
[1] = swizzleLane3(c
);
408 verts
[2] = swizzleLane4(c
);
411 verts
[0] = swizzleLane5(c
);
412 verts
[1] = swizzleLane6(c
);
413 verts
[2] = swizzleLane7(c
);
418 bool PaTriStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
420 SetNextPaState(pa
, PaTriStrip1
, PaTriStripSingle0
);
421 return false; // Not enough vertices to assemble 8 triangles.
424 bool PaTriStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
426 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
427 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
430 for(int i
= 0; i
< 4; ++i
)
432 simdscalar a0
= a
[i
];
433 simdscalar b0
= b
[i
];
435 // Tri Pattern - provoking vertex is always v0
439 simdvector
& v0
= verts
[0];
443 s
= _mm256_permute2f128_ps(a0
, b0
, 0x21);
445 s
= _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(1, 0, 3, 2));
447 simdvector
& v1
= verts
[1];
449 v1
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(3, 1, 3, 1));
451 simdvector
& v2
= verts
[2];
453 v2
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(2, 2, 2, 2));
456 SetNextPaState(pa
, PaTriStrip1
, PaTriStripSingle0
, 0, KNOB_SIMD_WIDTH
);
#if 0 // ENABLE_AVX512_SIMD16
// Disabled simd16 variant of PaTriStrip1, kept for reference.
// NOTE(review): as written it would not compile if enabled - 'verts' is
// declared simdvector[] but bound to simd16vector&, and it calls the
// non-simd16 PaGetSimdVector / SetNextPaState. Fix those before enabling.
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
    const simd16vector &a = PaGetSimdVector(pa, pa.prev, slot);
    const simd16vector &b = PaGetSimdVector(pa, pa.cur, slot);

    simd16vector &v0 = verts[0];
    simd16vector &v1 = verts[1];
    simd16vector &v2 = verts[2];

    //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
    //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
    //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0

    // for simd16 x, y, z, and w
    for (int i = 0; i < 4; i += 1)
    {
        simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
        simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3

        simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000);                     // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
        simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2));   // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1

        v0[i] = a[i];                                                                    // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
        v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1));                // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
        v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2));                // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
    }

    SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH);
    return true;
}

#endif
493 void PaTriStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
495 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
496 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
498 // Convert from vertical to horizontal.
499 // Tri Pattern - provoking vertex is always v0
506 verts
[0] = swizzleLane0(a
);
507 verts
[1] = swizzleLane1(a
);
508 verts
[2] = swizzleLane2(a
);
511 verts
[0] = swizzleLane1(a
);
512 verts
[1] = swizzleLane3(a
);
513 verts
[2] = swizzleLane2(a
);
516 verts
[0] = swizzleLane2(a
);
517 verts
[1] = swizzleLane3(a
);
518 verts
[2] = swizzleLane4(a
);
521 verts
[0] = swizzleLane3(a
);
522 verts
[1] = swizzleLane5(a
);
523 verts
[2] = swizzleLane4(a
);
526 verts
[0] = swizzleLane4(a
);
527 verts
[1] = swizzleLane5(a
);
528 verts
[2] = swizzleLane6(a
);
531 verts
[0] = swizzleLane5(a
);
532 verts
[1] = swizzleLane7(a
);
533 verts
[2] = swizzleLane6(a
);
536 verts
[0] = swizzleLane6(a
);
537 verts
[1] = swizzleLane7(a
);
538 verts
[2] = swizzleLane0(b
);
541 verts
[0] = swizzleLane7(a
);
542 verts
[1] = swizzleLane1(b
);
543 verts
[2] = swizzleLane0(b
);
548 bool PaTriFan0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
550 simdvector
& a
= PaGetSimdVector(pa
, pa
.cur
, slot
);
552 // Extract vertex 0 to every lane of first vector
553 for(int i
= 0; i
< 4; ++i
)
556 simdvector
& v0
= verts
[0];
557 v0
[i
] = _simd_shuffle_ps(a0
, a0
, _MM_SHUFFLE(0, 0, 0, 0));
558 v0
[i
] = _mm256_permute2f128_ps(v0
[i
], a0
, 0x00);
561 // store off leading vertex for attributes
562 PA_STATE_OPT::SIMDVERTEX
* pVertex
= (PA_STATE_OPT::SIMDVERTEX
*)pa
.pStreamBase
;
563 pa
.leadingVertex
= pVertex
[pa
.cur
];
565 SetNextPaState(pa
, PaTriFan1
, PaTriFanSingle0
);
566 return false; // Not enough vertices to assemble 8 triangles.
569 bool PaTriFan1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
571 PA_STATE_OPT::SIMDVECTOR
& leadVert
= pa
.leadingVertex
.attrib
[slot
];
572 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
573 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
576 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
577 for(int i
= 0; i
< 4; ++i
)
579 simdscalar a0
= a
[i
];
580 simdscalar b0
= b
[i
];
582 #if USE_SIMD16_FRONTEND
583 __m256 comp
= leadVert
[i
].lo
;
585 __m256 comp
= leadVert
[i
];
587 simdvector
& v0
= verts
[0];
588 v0
[i
] = _simd_shuffle_ps(comp
, comp
, _MM_SHUFFLE(0, 0, 0, 0));
589 v0
[i
] = _mm256_permute2f128_ps(v0
[i
], comp
, 0x00);
591 simdvector
& v2
= verts
[2];
592 s
= _mm256_permute2f128_ps(a0
, b0
, 0x21);
593 v2
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(1, 0, 3, 2));
595 simdvector
& v1
= verts
[1];
596 v1
[i
] = _simd_shuffle_ps(a0
, v2
[i
], _MM_SHUFFLE(2, 1, 2, 1));
599 SetNextPaState(pa
, PaTriFan1
, PaTriFanSingle0
, 0, KNOB_SIMD_WIDTH
);
603 void PaTriFanSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
605 // vert 0 from leading vertex
606 #if USE_SIMD16_FRONTEND
607 PA_STATE_OPT::SIMDVECTOR
& temp
= pa
.leadingVertex
.attrib
[slot
];
610 lead
[0] = temp
[0].lo
;
611 lead
[1] = temp
[1].lo
;
612 lead
[2] = temp
[2].lo
;
613 lead
[3] = temp
[3].lo
;
614 verts
[0] = swizzleLane0(lead
);
616 PA_STATE_OPT::SIMDVECTOR
& lead
= pa
.leadingVertex
.attrib
[slot
];
617 verts
[0] = swizzleLane0(lead
);
620 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
621 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
626 verts
[1] = swizzleLaneN(a
, primIndex
+ 1);
630 verts
[1] = swizzleLane0(b
);
636 verts
[2] = swizzleLaneN(a
, primIndex
+ 2);
640 verts
[2] = swizzleLaneN(b
, primIndex
- 6);
644 bool PaQuadList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
646 SetNextPaState(pa
, PaQuadList1
, PaQuadListSingle0
);
647 return false; // Not enough vertices to assemble 8 triangles.
650 bool PaQuadList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
652 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
653 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
656 for(int i
= 0; i
< 4; ++i
)
658 simdscalar a0
= a
[i
];
659 simdscalar b0
= b
[i
];
661 s1
= _mm256_permute2f128_ps(a0
, b0
, 0x20);
662 s2
= _mm256_permute2f128_ps(a0
, b0
, 0x31);
664 simdvector
& v0
= verts
[0];
665 v0
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(0, 0, 0, 0));
667 simdvector
& v1
= verts
[1];
668 v1
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(2, 1, 2, 1));
670 simdvector
& v2
= verts
[2];
671 v2
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(3, 2, 3, 2));
674 SetNextPaState(pa
, PaQuadList0
, PaQuadListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
678 void PaQuadListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
680 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
681 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
686 // triangle 0 - 0 1 2
687 verts
[0] = swizzleLane0(a
);
688 verts
[1] = swizzleLane1(a
);
689 verts
[2] = swizzleLane2(a
);
693 // triangle 1 - 0 2 3
694 verts
[0] = swizzleLane0(a
);
695 verts
[1] = swizzleLane2(a
);
696 verts
[2] = swizzleLane3(a
);
700 // triangle 2 - 4 5 6
701 verts
[0] = swizzleLane4(a
);
702 verts
[1] = swizzleLane5(a
);
703 verts
[2] = swizzleLane6(a
);
707 // triangle 3 - 4 6 7
708 verts
[0] = swizzleLane4(a
);
709 verts
[1] = swizzleLane6(a
);
710 verts
[2] = swizzleLane7(a
);
714 // triangle 4 - 8 9 10 (0 1 2)
715 verts
[0] = swizzleLane0(b
);
716 verts
[1] = swizzleLane1(b
);
717 verts
[2] = swizzleLane2(b
);
721 // triangle 1 - 0 2 3
722 verts
[0] = swizzleLane0(b
);
723 verts
[1] = swizzleLane2(b
);
724 verts
[2] = swizzleLane3(b
);
728 // triangle 2 - 4 5 6
729 verts
[0] = swizzleLane4(b
);
730 verts
[1] = swizzleLane5(b
);
731 verts
[2] = swizzleLane6(b
);
735 // triangle 3 - 4 6 7
736 verts
[0] = swizzleLane4(b
);
737 verts
[1] = swizzleLane6(b
);
738 verts
[2] = swizzleLane7(b
);
743 void PaLineLoopSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t lineIndex
, __m128 verts
[])
745 PaLineStripSingle0(pa
, slot
, lineIndex
, verts
);
747 if (pa
.numPrimsComplete
+ lineIndex
== pa
.numPrims
- 1) {
748 simdvector
&start
= PaGetSimdVector(pa
, pa
.first
, slot
);
749 verts
[1] = swizzleLane0(start
);
753 bool PaLineLoop0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
755 SetNextPaState(pa
, PaLineLoop1
, PaLineLoopSingle0
);
759 bool PaLineLoop1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
761 PaLineStrip1(pa
, slot
, verts
);
763 if (pa
.numPrimsComplete
+ KNOB_SIMD_WIDTH
> pa
.numPrims
- 1) {
764 // loop reconnect now
765 int lane
= pa
.numPrims
- pa
.numPrimsComplete
- 1;
766 simdvector
&start
= PaGetSimdVector(pa
, pa
.first
, slot
);
767 for (int i
= 0; i
< 4; i
++) {
768 float *startVtx
= (float *)&(start
[i
]);
769 float *targetVtx
= (float *)&(verts
[1][i
]);
770 targetVtx
[lane
] = startVtx
[0];
774 SetNextPaState(pa
, PaLineLoop1
, PaLineLoopSingle0
, 0, KNOB_SIMD_WIDTH
);
779 bool PaLineList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
781 SetNextPaState(pa
, PaLineList1
, PaLineListSingle0
);
782 return false; // Not enough vertices to assemble 8 lines
785 bool PaLineList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
787 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
788 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
789 /// @todo: verify provoking vertex is correct
790 // Line list 0 1 2 3 4 5 6 7
791 // 8 9 10 11 12 13 14 15
794 // 0 2 4 6 8 10 12 14
795 // 1 3 5 7 9 11 13 15
797 for (uint32_t i
= 0; i
< 4; ++i
)
800 __m256 vALowBLow
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x20);
801 // 4 5 6 7 12 13 14 15
802 __m256 vAHighBHigh
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x31);
804 // 0 2 4 6 8 10 12 14
805 verts
[0].v
[i
] = _mm256_shuffle_ps(vALowBLow
, vAHighBHigh
, _MM_SHUFFLE(2, 0, 2, 0));
806 // 1 3 5 7 9 11 13 15
807 verts
[1].v
[i
] = _mm256_shuffle_ps(vALowBLow
, vAHighBHigh
, _MM_SHUFFLE(3, 1, 3, 1));
810 SetNextPaState(pa
, PaLineList0
, PaLineListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
814 void PaLineListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
816 simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
817 simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
822 verts
[0] = swizzleLane0(a
);
823 verts
[1] = swizzleLane1(a
);
826 verts
[0] = swizzleLane2(a
);
827 verts
[1] = swizzleLane3(a
);
830 verts
[0] = swizzleLane4(a
);
831 verts
[1] = swizzleLane5(a
);
834 verts
[0] = swizzleLane6(a
);
835 verts
[1] = swizzleLane7(a
);
838 verts
[0] = swizzleLane0(b
);
839 verts
[1] = swizzleLane1(b
);
842 verts
[0] = swizzleLane2(b
);
843 verts
[1] = swizzleLane3(b
);
846 verts
[0] = swizzleLane4(b
);
847 verts
[1] = swizzleLane5(b
);
850 verts
[0] = swizzleLane6(b
);
851 verts
[1] = swizzleLane7(b
);
856 bool PaLineStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
858 SetNextPaState(pa
, PaLineStrip1
, PaLineStripSingle0
);
859 return false; // Not enough vertices to assemble 8 lines
862 bool PaLineStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
864 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
865 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
867 /// @todo: verify provoking vertex is correct
868 // Line list 0 1 2 3 4 5 6 7
869 // 8 9 10 11 12 13 14 15
877 for(uint32_t i
= 0; i
< 4; ++i
)
880 __m256 vPermA
= _mm256_permute_ps(a
.v
[i
], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
882 __m256 vAHighBLow
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x21);
885 __m256 vPermB
= _mm256_permute_ps(vAHighBLow
, 0); // indices hi->low (0 0 0 0)
887 verts
[1].v
[i
] = _mm256_blend_ps(vPermA
, vPermB
, 0x88);
890 SetNextPaState(pa
, PaLineStrip1
, PaLineStripSingle0
, 0, KNOB_SIMD_WIDTH
);
894 void PaLineStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t lineIndex
, __m128 verts
[])
896 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
897 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
902 verts
[0] = swizzleLane0(a
);
903 verts
[1] = swizzleLane1(a
);
906 verts
[0] = swizzleLane1(a
);
907 verts
[1] = swizzleLane2(a
);
910 verts
[0] = swizzleLane2(a
);
911 verts
[1] = swizzleLane3(a
);
914 verts
[0] = swizzleLane3(a
);
915 verts
[1] = swizzleLane4(a
);
918 verts
[0] = swizzleLane4(a
);
919 verts
[1] = swizzleLane5(a
);
922 verts
[0] = swizzleLane5(a
);
923 verts
[1] = swizzleLane6(a
);
926 verts
[0] = swizzleLane6(a
);
927 verts
[1] = swizzleLane7(a
);
930 verts
[0] = swizzleLane7(a
);
931 verts
[1] = swizzleLane0(b
);
936 bool PaPoints0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
938 simdvector
& a
= PaGetSimdVector(pa
, pa
.cur
, slot
);
940 verts
[0] = a
; // points only have 1 vertex.
942 SetNextPaState(pa
, PaPoints0
, PaPointsSingle0
, 0, KNOB_SIMD_WIDTH
, true);
946 void PaPointsSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
948 simdvector
&a
= PaGetSimdVector(pa
, pa
.cur
, slot
);
952 verts
[0] = swizzleLane0(a
);
955 verts
[0] = swizzleLane1(a
);
958 verts
[0] = swizzleLane2(a
);
961 verts
[0] = swizzleLane3(a
);
964 verts
[0] = swizzleLane4(a
);
967 verts
[0] = swizzleLane5(a
);
970 verts
[0] = swizzleLane6(a
);
973 verts
[0] = swizzleLane7(a
);
978 //////////////////////////////////////////////////////////////////////////
979 /// @brief State 1 for RECT_LIST topology.
980 /// There is not enough to assemble 8 triangles.
981 bool PaRectList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
983 SetNextPaState(pa
, PaRectList1
, PaRectListSingle0
);
987 //////////////////////////////////////////////////////////////////////////
988 /// @brief State 1 for RECT_LIST topology.
989 /// Rect lists has the following format.
991 /// v2 o---o v5 o---o v8 o---o v11 o---o
992 /// | \ | | \ | | \ | | \ |
993 /// v1 o---o v4 o---o v7 o---o v10 o---o
996 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
998 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
999 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
1002 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
1003 /// where v0 contains all the first vertices for 8 triangles.
1006 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
1007 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
1008 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
1010 /// @param pa - State for PA state machine.
1011 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1012 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1018 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
1019 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
1020 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
1022 __m256 tmp0
, tmp1
, tmp2
;
1024 // Loop over each component in the simdvector.
1025 for(int i
= 0; i
< 4; ++i
)
1027 simdvector
& v0
= verts
[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
1028 tmp0
= _mm256_permute2f128_ps(b
[i
], b
[i
], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
1029 v0
[i
] = _mm256_blend_ps(a
[i
], tmp0
, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
1030 tmp1
= _mm256_permute_ps(v0
[i
], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
1031 v0
[i
] = _mm256_permute_ps(v0
[i
], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
1032 v0
[i
] = _mm256_blend_ps(tmp1
, v0
[i
], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
1034 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
1035 /// AVX2 should make this much cheaper.
1036 simdvector
& v1
= verts
[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
1037 v1
[i
] = _mm256_permute_ps(a
[i
], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
1038 tmp1
= _mm256_permute_ps(a
[i
], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
1039 tmp2
= _mm256_blend_ps(v1
[i
], tmp1
, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
1040 tmp1
= _mm256_permute2f128_ps(tmp2
, tmp2
, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
1041 v1
[i
] = _mm256_permute_ps(tmp0
, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
1042 v1
[i
] = _mm256_blend_ps(tmp2
, v1
[i
], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
1043 v1
[i
] = _mm256_blend_ps(v1
[i
], tmp1
, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
1045 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
1046 simdvector
& v2
= verts
[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
1047 v2
[i
] = _mm256_permute_ps(tmp0
, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
1048 tmp1
= _mm256_permute_ps(tmp2
, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
1049 v2
[i
] = _mm256_blend_ps(tmp1
, v2
[i
], 0xF0);
1051 // Need to compute 4th implied vertex for the rectangle.
1052 tmp2
= _mm256_sub_ps(v0
[i
], v1
[i
]);
1053 tmp2
= _mm256_add_ps(tmp2
, v2
[i
]); // tmp2 = { w, *, x, *, y, *, z, * }
1054 tmp2
= _mm256_permute_ps(tmp2
, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
1055 v2
[i
] = _mm256_blend_ps(v2
[i
], tmp2
, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
1058 SetNextPaState(pa
, PaRectList1
, PaRectListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
1062 //////////////////////////////////////////////////////////////////////////
1063 /// @brief State 2 for RECT_LIST topology.
1064 /// Not implemented unless there is a use case for more than 8 rects.
/// The visible body only asserts and then hands the state machine back to
/// PaRectList0; it does not read slot or write verts.
1065 /// @param pa - State for PA state machine.
1066 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1067 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
// NOTE(review): the function signature and return statement are not visible in
// this excerpt; confirm against the full file.
// Asserts unconditionally: reaching this state is treated as unexpected.
1073 SWR_ASSERT(0); // Is rect list used for anything other than clears?
// Registers PaRectList0 as the next state and PaRectListSingle0 as the
// single-primitive assembler.
// NOTE(review): the meaning of the trailing arguments (0, KNOB_SIMD_WIDTH, true)
// is defined by SetNextPaState, which is outside this excerpt; confirm there.
1074 SetNextPaState(pa
, PaRectList0
, PaRectListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
1078 #if ENABLE_AVX512_SIMD16
1079 //////////////////////////////////////////////////////////////////////////
1080 /// @brief State 0 for RECT_LIST topology.
1081 /// There is not enough to assemble 8 triangles.
/// The visible body only advances the state machine to PaRectList1_simd16;
/// it does not read slot or write verts.
/// @param pa - State for PA state machine.
/// @param slot - Not referenced by the visible body.
/// @param verts - Not referenced by the visible body.
1082 bool PaRectList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
// Registers PaRectList1_simd16 as the next state and PaRectListSingle0 as the
// single-primitive assembler.
// NOTE(review): the return statement is not visible in this excerpt.
1084 SetNextPaState_simd16(pa
, PaRectList1_simd16
, PaRectListSingle0
);
1088 //////////////////////////////////////////////////////////////////////////
1089 /// @brief State 1 for RECT_LIST topology.
1090 /// Rect lists have the following format.
1092 /// v2 o---o v5 o---o v8 o---o v11 o---o
1093 /// | \ | | \ | | \ | | \ |
1094 /// v1 o---o v4 o---o v7 o---o v10 o---o
1097 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
1099 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
1100 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
1103 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
1104 /// where v0 contains all the first vertices for 8 triangles.
1107 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
1108 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
1109 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
/// In this simd16 variant only the .lo half of each output component is
/// filled with the layout above; the .hi half is set to zero.
1111 /// @param pa - State for PA state machine.
1112 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1113 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1114 bool PaRectList1_simd16(
1117 simd16vector verts
[])
1119 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }
1120 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
); // b[] = { v16...but not used by this implementation.. }
// NOTE(review): the declarations of the 8-wide operands 'a' and 'b' used below,
// and the statements that fill them from a_16/b_16 depending on
// pa.useAlternateOffset, are not visible in this excerpt; confirm against the
// full file.
1125 for (uint32_t i
= 0; i
< 4; i
+= 1)
1127 if (pa
.useAlternateOffset
)
1139 __m256 tmp0
, tmp1
, tmp2
;
1141 // Loop over each component in the simdvector.
1142 for (int i
= 0; i
< 4; i
+= 1)
1144 simd16vector
& v0
= verts
[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
// Swap the two 128-bit halves of b[i]; tmp0 is reused for verts[1] and verts[2] below.
1145 tmp0
= _mm256_permute2f128_ps(b
[i
], b
[i
], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
1146 v0
[i
].lo
= _mm256_blend_ps(a
[i
], tmp0
, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
1147 tmp1
= _mm256_permute_ps(v0
[i
].lo
, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
1148 v0
[i
].lo
= _mm256_permute_ps(v0
[i
].lo
, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
1149 v0
[i
].lo
= _mm256_blend_ps(tmp1
, v0
[i
].lo
, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
1151 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
1152 /// AVX2 should make this much cheaper.
1153 simd16vector
& v1
= verts
[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
1154 v1
[i
].lo
= _mm256_permute_ps(a
[i
], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
1155 tmp1
= _mm256_permute_ps(a
[i
], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
// tmp2 is kept live: it also feeds the verts[2] shuffle further down.
1156 tmp2
= _mm256_blend_ps(v1
[i
].lo
, tmp1
, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
1157 tmp1
= _mm256_permute2f128_ps(tmp2
, tmp2
, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * }
1158 v1
[i
].lo
= _mm256_permute_ps(tmp0
, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
1159 v1
[i
].lo
= _mm256_blend_ps(tmp2
, v1
[i
].lo
, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
1160 v1
[i
].lo
= _mm256_blend_ps(v1
[i
].lo
, tmp1
, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
1162 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
1163 simd16vector
& v2
= verts
[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
1164 v2
[i
].lo
= _mm256_permute_ps(tmp0
, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
1165 tmp1
= _mm256_permute_ps(tmp2
, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
1166 v2
[i
].lo
= _mm256_blend_ps(tmp1
, v2
[i
].lo
, 0xF0); // v2 = { v2, *, v5, *, v8, *, v11, * }
1168 // Need to compute 4th implied vertex for the rectangle.
// Lane-wise v0 - v1 + v2; only the even lanes hold the implied vertex.
1169 tmp2
= _mm256_sub_ps(v0
[i
].lo
, v1
[i
].lo
);
1170 tmp2
= _mm256_add_ps(tmp2
, v2
[i
].lo
); // tmp2 = { w, *, x, *, y, *, z, * }
1171 tmp2
= _mm256_permute_ps(tmp2
, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
1172 v2
[i
].lo
= _mm256_blend_ps(v2
[i
].lo
, tmp2
, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
// Only the lower halves were assembled above; clear the upper halves of all three outputs.
1174 v0
[i
].hi
= _simd_setzero_ps();
1175 v1
[i
].hi
= _simd_setzero_ps();
1176 v2
[i
].hi
= _simd_setzero_ps();
// Registers this same function (PaRectList1_simd16) as the next state and
// PaRectListSingle0 as the single-primitive assembler.
// NOTE(review): the meaning of the trailing arguments (0, KNOB_SIMD16_WIDTH, true)
// is defined by SetNextPaState_simd16, which is outside this excerpt; the
// return statement is also not visible here.
1179 SetNextPaState_simd16(pa
, PaRectList1_simd16
, PaRectListSingle0
, 0, KNOB_SIMD16_WIDTH
, true);
1183 //////////////////////////////////////////////////////////////////////////
1184 /// @brief State 2 for RECT_LIST topology.
1185 /// Not implemented unless there is a use case for more than 8 rects.
/// The visible body only asserts and then hands the state machine back to
/// PaRectList0_simd16; it does not read slot or write verts.
1186 /// @param pa - State for PA state machine.
1187 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
1188 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
1189 bool PaRectList2_simd16(
1192 simd16vector verts
[])
// Asserts unconditionally: reaching this state is treated as unexpected.
1194 SWR_ASSERT(0); // Is rect list used for anything other than clears?
// Registers PaRectList0_simd16 as the next state and PaRectListSingle0 as the
// single-primitive assembler.
// NOTE(review): the meaning of the trailing arguments (0, KNOB_SIMD16_WIDTH, true)
// is defined by SetNextPaState_simd16, which is outside this excerpt; the
// return statement is also not visible here.
1195 SetNextPaState_simd16(pa
, PaRectList0_simd16
, PaRectListSingle0
, 0, KNOB_SIMD16_WIDTH
, true);
1200 //////////////////////////////////////////////////////////////////////////
1201 /// @brief This procedure is called by the Binner to assemble the attributes.
1202 /// Unlike position, which is stored vertically, the attributes are
1203 /// stored horizontally. The outputs from the VS, labeled as 'a' and
1204 /// 'b' are vertical. This function needs to transpose the lanes
1205 /// containing the vertical attribute data into horizontal form.
1206 /// @param pa - State for PA state machine.
1207 /// @param slot - Index into VS output for a given attribute.
1208 /// @param primIndex - Binner processes each triangle individually.
1209 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
/// NOTE(review): the elements of verts are passed to _mm_blend_ps below, i.e.
/// they are handled as 128-bit vectors holding one vertex each; the
/// "for 8 triangles" wording above may be carried over from the SIMD-wide
/// assemblers. Confirm against the full signature, which is not visible here.
1210 void PaRectListSingle0(
1216 // We have 12 simdscalars contained within 3 simdvectors which
1217 // hold at least 8 triangles worth of data. We want to assemble a single
1218 // triangle with data in horizontal form.
1219 #if USE_SIMD16_FRONTEND
1220 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
1221 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
// NOTE(review): the declaration of the 8-wide 'a' used below and the statements
// that fill it from a_16/b_16 depending on pa.useAlternateOffset are not
// visible in this excerpt; confirm against the full file.
1226 for (uint32_t i
= 0; i
< 4; i
+= 1)
1228 if (pa
.useAlternateOffset
)
// NOTE(review): presumably the non-SIMD16 alternative to the block above; the
// #else/#endif lines are not visible here.
1241 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
1244 // Convert from vertical to horizontal.
// NOTE(review): the two assignment groups below are presumably alternative
// branches selected by primIndex; the selecting switch/case lines are not
// visible in this excerpt.
// First group: vertices come straight from lanes 0, 1 and 2 of 'a'.
1248 verts
[0] = swizzleLane0(a
);
1249 verts
[1] = swizzleLane1(a
);
1250 verts
[2] = swizzleLane2(a
);
// Second group: lanes 0 and 2 of 'a', plus a third vertex built from them.
1253 verts
[0] = swizzleLane0(a
);
1254 verts
[1] = swizzleLane2(a
);
// Blend mask 0x2: element 1 is taken from verts[1], all other elements from verts[0].
1255 verts
[2] = _mm_blend_ps(verts
[0], verts
[1], 0x2);
1268 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT
*in_pDC
, uint32_t in_numPrims
, uint8_t* pStream
, uint32_t in_streamSizeInVerts
,
1269 bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
) : PA_STATE(in_pDC
, pStream
, in_streamSizeInVerts
), numPrims(in_numPrims
), numPrimsComplete(0), numSimdPrims(0),
1270 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming
)
1272 const API_STATE
& state
= GetApiState(pDC
);
1274 this->binTopology
= topo
== TOP_UNKNOWN
? state
.topology
: topo
;
1276 #if ENABLE_AVX512_SIMD16
1277 pfnPaFunc_simd16
= nullptr;
1280 switch (this->binTopology
)
1282 case TOP_TRIANGLE_LIST
:
1283 this->pfnPaFunc
= PaTriList0
;
1284 #if ENABLE_AVX512_SIMD16
1285 this->pfnPaFunc_simd16
= PaTriList0_simd16
;
1288 case TOP_TRIANGLE_STRIP
:
1289 this->pfnPaFunc
= PaTriStrip0
;
1291 case TOP_TRIANGLE_FAN
:
1292 this->pfnPaFunc
= PaTriFan0
;
1295 this->pfnPaFunc
= PaQuadList0
;
1296 this->numPrims
= in_numPrims
* 2; // Convert quad primitives into triangles
1298 case TOP_QUAD_STRIP
:
1299 // quad strip pattern when decomposed into triangles is the same as triangle strips
1300 this->pfnPaFunc
= PaTriStrip0
;
1301 this->numPrims
= in_numPrims
* 2; // Convert quad primitives into triangles
1304 this->pfnPaFunc
= PaLineList0
;
1305 this->numPrims
= in_numPrims
;
1307 case TOP_LINE_STRIP
:
1308 this->pfnPaFunc
= PaLineStrip0
;
1309 this->numPrims
= in_numPrims
;
1312 this->pfnPaFunc
= PaLineLoop0
;
1313 this->numPrims
= in_numPrims
;
1315 case TOP_POINT_LIST
:
1316 // use point binner and rasterizer if supported
1317 this->pfnPaFunc
= PaPoints0
;
1318 this->numPrims
= in_numPrims
;
1321 this->pfnPaFunc
= PaRectList0
;
1322 #if ENABLE_AVX512_SIMD16
1323 this->pfnPaFunc_simd16
= PaRectList0_simd16
;
1325 this->numPrims
= in_numPrims
* 2;
1328 case TOP_PATCHLIST_1
:
1329 this->pfnPaFunc
= PaPatchList
<1>;
1331 case TOP_PATCHLIST_2
:
1332 this->pfnPaFunc
= PaPatchList
<2>;
1334 case TOP_PATCHLIST_3
:
1335 this->pfnPaFunc
= PaPatchList
<3>;
1337 case TOP_PATCHLIST_4
:
1338 this->pfnPaFunc
= PaPatchList
<4>;
1340 case TOP_PATCHLIST_5
:
1341 this->pfnPaFunc
= PaPatchList
<5>;
1343 case TOP_PATCHLIST_6
:
1344 this->pfnPaFunc
= PaPatchList
<6>;
1346 case TOP_PATCHLIST_7
:
1347 this->pfnPaFunc
= PaPatchList
<7>;
1349 case TOP_PATCHLIST_8
:
1350 this->pfnPaFunc
= PaPatchList
<8>;
1352 case TOP_PATCHLIST_9
:
1353 this->pfnPaFunc
= PaPatchList
<9>;
1355 case TOP_PATCHLIST_10
:
1356 this->pfnPaFunc
= PaPatchList
<10>;
1358 case TOP_PATCHLIST_11
:
1359 this->pfnPaFunc
= PaPatchList
<11>;
1361 case TOP_PATCHLIST_12
:
1362 this->pfnPaFunc
= PaPatchList
<12>;
1364 case TOP_PATCHLIST_13
:
1365 this->pfnPaFunc
= PaPatchList
<13>;
1367 case TOP_PATCHLIST_14
:
1368 this->pfnPaFunc
= PaPatchList
<14>;
1370 case TOP_PATCHLIST_15
:
1371 this->pfnPaFunc
= PaPatchList
<15>;
1373 case TOP_PATCHLIST_16
:
1374 this->pfnPaFunc
= PaPatchList
<16>;
1376 case TOP_PATCHLIST_17
:
1377 this->pfnPaFunc
= PaPatchList
<17>;
1379 case TOP_PATCHLIST_18
:
1380 this->pfnPaFunc
= PaPatchList
<18>;
1382 case TOP_PATCHLIST_19
:
1383 this->pfnPaFunc
= PaPatchList
<19>;
1385 case TOP_PATCHLIST_20
:
1386 this->pfnPaFunc
= PaPatchList
<20>;
1388 case TOP_PATCHLIST_21
:
1389 this->pfnPaFunc
= PaPatchList
<21>;
1391 case TOP_PATCHLIST_22
:
1392 this->pfnPaFunc
= PaPatchList
<22>;
1394 case TOP_PATCHLIST_23
:
1395 this->pfnPaFunc
= PaPatchList
<23>;
1397 case TOP_PATCHLIST_24
:
1398 this->pfnPaFunc
= PaPatchList
<24>;
1400 case TOP_PATCHLIST_25
:
1401 this->pfnPaFunc
= PaPatchList
<25>;
1403 case TOP_PATCHLIST_26
:
1404 this->pfnPaFunc
= PaPatchList
<26>;
1406 case TOP_PATCHLIST_27
:
1407 this->pfnPaFunc
= PaPatchList
<27>;
1409 case TOP_PATCHLIST_28
:
1410 this->pfnPaFunc
= PaPatchList
<28>;
1412 case TOP_PATCHLIST_29
:
1413 this->pfnPaFunc
= PaPatchList
<29>;
1415 case TOP_PATCHLIST_30
:
1416 this->pfnPaFunc
= PaPatchList
<30>;
1418 case TOP_PATCHLIST_31
:
1419 this->pfnPaFunc
= PaPatchList
<31>;
1421 case TOP_PATCHLIST_32
:
1422 this->pfnPaFunc
= PaPatchList
<32>;
1430 this->pfnPaFuncReset
= this->pfnPaFunc
;
1431 #if ENABLE_AVX512_SIMD16
1432 this->pfnPaFuncReset_simd16
= this->pfnPaFunc_simd16
;
1435 #if USE_SIMD16_FRONTEND
1436 simd16scalari id16
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1437 simd16scalari id82
= _simd16_set_epi32( 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
1440 simdscalari id8
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1441 simdscalari id4
= _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
1444 switch(this->binTopology
)
1446 case TOP_TRIANGLE_LIST
:
1447 case TOP_TRIANGLE_STRIP
:
1448 case TOP_TRIANGLE_FAN
:
1449 case TOP_LINE_STRIP
:
1452 #if USE_SIMD16_FRONTEND
1453 this->primIDIncr
= 16;
1454 this->primID
= id16
;
1456 this->primIDIncr
= 8;
1461 case TOP_QUAD_STRIP
:
1463 #if USE_SIMD16_FRONTEND
1464 this->primIDIncr
= 8;
1465 this->primID
= id82
;
1467 this->primIDIncr
= 4;
1471 case TOP_POINT_LIST
:
1472 #if USE_SIMD16_FRONTEND
1473 this->primIDIncr
= 16;
1474 this->primID
= id16
;
1476 this->primIDIncr
= 8;
1480 case TOP_PATCHLIST_1
:
1481 case TOP_PATCHLIST_2
:
1482 case TOP_PATCHLIST_3
:
1483 case TOP_PATCHLIST_4
:
1484 case TOP_PATCHLIST_5
:
1485 case TOP_PATCHLIST_6
:
1486 case TOP_PATCHLIST_7
:
1487 case TOP_PATCHLIST_8
:
1488 case TOP_PATCHLIST_9
:
1489 case TOP_PATCHLIST_10
:
1490 case TOP_PATCHLIST_11
:
1491 case TOP_PATCHLIST_12
:
1492 case TOP_PATCHLIST_13
:
1493 case TOP_PATCHLIST_14
:
1494 case TOP_PATCHLIST_15
:
1495 case TOP_PATCHLIST_16
:
1496 case TOP_PATCHLIST_17
:
1497 case TOP_PATCHLIST_18
:
1498 case TOP_PATCHLIST_19
:
1499 case TOP_PATCHLIST_20
:
1500 case TOP_PATCHLIST_21
:
1501 case TOP_PATCHLIST_22
:
1502 case TOP_PATCHLIST_23
:
1503 case TOP_PATCHLIST_24
:
1504 case TOP_PATCHLIST_25
:
1505 case TOP_PATCHLIST_26
:
1506 case TOP_PATCHLIST_27
:
1507 case TOP_PATCHLIST_28
:
1508 case TOP_PATCHLIST_29
:
1509 case TOP_PATCHLIST_30
:
1510 case TOP_PATCHLIST_31
:
1511 case TOP_PATCHLIST_32
:
1512 // Always run KNOB_SIMD_WIDTH number of patches at a time.
1513 #if USE_SIMD16_FRONTEND
1514 this->primIDIncr
= 16;
1515 this->primID
= id16
;
1517 this->primIDIncr
= 8;