1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
35 #if (KNOB_SIMD_WIDTH == 8)
37 INLINE __m128
swizzleLane0(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
39 simdscalar tmp0
= _mm256_unpacklo_ps(x
, z
);
40 simdscalar tmp1
= _mm256_unpacklo_ps(y
, w
);
41 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 0);
44 INLINE __m128
swizzleLane1(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
46 simdscalar tmp0
= _mm256_unpacklo_ps(x
, z
);
47 simdscalar tmp1
= _mm256_unpacklo_ps(y
, w
);
48 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 0);
51 INLINE __m128
swizzleLane2(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
53 simdscalar tmp0
= _mm256_unpackhi_ps(x
, z
);
54 simdscalar tmp1
= _mm256_unpackhi_ps(y
, w
);
55 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 0);
58 INLINE __m128
swizzleLane3(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
60 simdscalar tmp0
= _mm256_unpackhi_ps(x
, z
);
61 simdscalar tmp1
= _mm256_unpackhi_ps(y
, w
);
62 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 0);
65 INLINE __m128
swizzleLane4(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
67 simdscalar tmp0
= _mm256_unpacklo_ps(x
, z
);
68 simdscalar tmp1
= _mm256_unpacklo_ps(y
, w
);
69 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 1);
72 INLINE __m128
swizzleLane5(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
74 simdscalar tmp0
= _mm256_unpacklo_ps(x
, z
);
75 simdscalar tmp1
= _mm256_unpacklo_ps(y
, w
);
76 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 1);
79 INLINE __m128
swizzleLane6(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
81 simdscalar tmp0
= _mm256_unpackhi_ps(x
, z
);
82 simdscalar tmp1
= _mm256_unpackhi_ps(y
, w
);
83 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 1);
86 INLINE __m128
swizzleLane7(const simdscalar
&x
, const simdscalar
&y
, const simdscalar
&z
, const simdscalar
&w
)
88 simdscalar tmp0
= _mm256_unpackhi_ps(x
, z
);
89 simdscalar tmp1
= _mm256_unpackhi_ps(y
, w
);
90 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 1);
93 INLINE __m128
swizzleLane0(const simdvector
&v
)
95 return swizzleLane0(v
.x
, v
.y
, v
.z
, v
.w
);
98 INLINE __m128
swizzleLane1(const simdvector
&v
)
100 return swizzleLane1(v
.x
, v
.y
, v
.z
, v
.w
);
103 INLINE __m128
swizzleLane2(const simdvector
&v
)
105 return swizzleLane2(v
.x
, v
.y
, v
.z
, v
.w
);
108 INLINE __m128
swizzleLane3(const simdvector
&v
)
110 return swizzleLane3(v
.x
, v
.y
, v
.z
, v
.w
);
113 INLINE __m128
swizzleLane4(const simdvector
&v
)
115 return swizzleLane4(v
.x
, v
.y
, v
.z
, v
.w
);
118 INLINE __m128
swizzleLane5(const simdvector
&v
)
120 return swizzleLane5(v
.x
, v
.y
, v
.z
, v
.w
);
123 INLINE __m128
swizzleLane6(const simdvector
&v
)
125 return swizzleLane6(v
.x
, v
.y
, v
.z
, v
.w
);
128 INLINE __m128
swizzleLane7(const simdvector
&v
)
130 return swizzleLane7(v
.x
, v
.y
, v
.z
, v
.w
);
133 INLINE __m128
swizzleLaneN(const simdvector
&v
, int lane
)
138 return swizzleLane0(v
);
140 return swizzleLane1(v
);
142 return swizzleLane2(v
);
144 return swizzleLane3(v
);
146 return swizzleLane4(v
);
148 return swizzleLane5(v
);
150 return swizzleLane6(v
);
152 return swizzleLane7(v
);
154 return _mm_setzero_ps();
158 #if ENABLE_AVX512_SIMD16
159 INLINE __m128
swizzleLane0(const simd16vector
&v
)
161 return swizzleLane0(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
164 INLINE __m128
swizzleLane1(const simd16vector
&v
)
166 return swizzleLane1(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
169 INLINE __m128
swizzleLane2(const simd16vector
&v
)
171 return swizzleLane2(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
174 INLINE __m128
swizzleLane3(const simd16vector
&v
)
176 return swizzleLane3(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
179 INLINE __m128
swizzleLane4(const simd16vector
&v
)
181 return swizzleLane4(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
184 INLINE __m128
swizzleLane5(const simd16vector
&v
)
186 return swizzleLane5(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
189 INLINE __m128
swizzleLane6(const simd16vector
&v
)
191 return swizzleLane6(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
194 INLINE __m128
swizzleLane7(const simd16vector
&v
)
196 return swizzleLane7(_simd16_extract_ps(v
.x
, 0), _simd16_extract_ps(v
.y
, 0), _simd16_extract_ps(v
.z
, 0), _simd16_extract_ps(v
.w
, 0));
199 INLINE __m128
swizzleLane8(const simd16vector
&v
)
201 return swizzleLane0(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
204 INLINE __m128
swizzleLane9(const simd16vector
&v
)
206 return swizzleLane1(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
209 INLINE __m128
swizzleLaneA(const simd16vector
&v
)
211 return swizzleLane2(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
214 INLINE __m128
swizzleLaneB(const simd16vector
&v
)
216 return swizzleLane3(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
219 INLINE __m128
swizzleLaneC(const simd16vector
&v
)
221 return swizzleLane4(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
224 INLINE __m128
swizzleLaneD(const simd16vector
&v
)
226 return swizzleLane5(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
229 INLINE __m128
swizzleLaneE(const simd16vector
&v
)
231 return swizzleLane6(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
234 INLINE __m128
swizzleLaneF(const simd16vector
&v
)
236 return swizzleLane7(_simd16_extract_ps(v
.x
, 1), _simd16_extract_ps(v
.y
, 1), _simd16_extract_ps(v
.z
, 1), _simd16_extract_ps(v
.w
, 1));
239 INLINE __m128
swizzleLaneN(const simd16vector
&v
, int lane
)
244 return swizzleLane0(v
);
246 return swizzleLane1(v
);
248 return swizzleLane2(v
);
250 return swizzleLane3(v
);
252 return swizzleLane4(v
);
254 return swizzleLane5(v
);
256 return swizzleLane6(v
);
258 return swizzleLane7(v
);
260 return swizzleLane8(v
);
262 return swizzleLane9(v
);
264 return swizzleLaneA(v
);
266 return swizzleLaneB(v
);
268 return swizzleLaneC(v
);
270 return swizzleLaneD(v
);
272 return swizzleLaneE(v
);
274 return swizzleLaneF(v
);
276 return _mm_setzero_ps();
281 bool PaTriList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
282 bool PaTriList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
283 bool PaTriList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
284 #if ENABLE_AVX512_SIMD16
285 bool PaTriList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
286 bool PaTriList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
287 bool PaTriList2_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
289 void PaTriListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
291 bool PaTriStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
292 bool PaTriStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
293 #if ENABLE_AVX512_SIMD16
294 bool PaTriStrip0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
295 bool PaTriStrip1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
297 void PaTriStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
299 bool PaTriFan0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
300 bool PaTriFan1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
301 #if ENABLE_AVX512_SIMD16
302 bool PaTriFan0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
303 bool PaTriFan1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
305 void PaTriFanSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
307 bool PaQuadList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
308 bool PaQuadList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
309 #if ENABLE_AVX512_SIMD16
310 bool PaQuadList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
311 bool PaQuadList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
313 void PaQuadListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
315 bool PaLineLoop0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
316 bool PaLineLoop1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
317 #if ENABLE_AVX512_SIMD16
318 bool PaLineLoop0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
319 bool PaLineLoop1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
321 void PaLineLoopSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
323 bool PaLineList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
324 bool PaLineList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
325 #if ENABLE_AVX512_SIMD16
326 bool PaLineList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
327 bool PaLineList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
329 void PaLineListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
331 bool PaLineStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
332 bool PaLineStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
333 #if ENABLE_AVX512_SIMD16
334 bool PaLineStrip0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
335 bool PaLineStrip1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
337 void PaLineStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
339 bool PaPoints0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
340 #if ENABLE_AVX512_SIMD16
341 bool PaPoints0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
343 void PaPointsSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
345 bool PaRectList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
346 bool PaRectList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
347 bool PaRectList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
348 #if ENABLE_AVX512_SIMD16
349 bool PaRectList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
350 bool PaRectList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
351 bool PaRectList2_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
353 void PaRectListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
355 template <uint32_t TotalControlPoints
>
356 void PaPatchListSingle(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
358 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
359 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
360 // Each attribute has 4 components.
362 /// @todo Optimize this
364 #if USE_SIMD16_FRONTEND
365 if (pa
.useAlternateOffset
)
367 primIndex
+= KNOB_SIMD_WIDTH
;
371 float* pOutVec
= (float*)verts
;
373 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
375 uint32_t input_cp
= primIndex
* TotalControlPoints
+ cp
;
376 #if USE_SIMD16_FRONTEND
377 uint32_t input_vec
= input_cp
/ KNOB_SIMD16_WIDTH
;
378 uint32_t input_lane
= input_cp
% KNOB_SIMD16_WIDTH
;
381 uint32_t input_vec
= input_cp
/ KNOB_SIMD_WIDTH
;
382 uint32_t input_lane
= input_cp
% KNOB_SIMD_WIDTH
;
385 // Loop over all components of the attribute
386 for (uint32_t i
= 0; i
< 4; ++i
)
388 #if USE_SIMD16_FRONTEND
389 const float* pInputVec
= (const float*)(&PaGetSimdVector_simd16(pa
, input_vec
, slot
)[i
]);
391 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
393 pOutVec
[cp
* 4 + i
] = pInputVec
[input_lane
];
398 template<uint32_t TotalControlPoints
, uint32_t CurrentControlPoints
= 1>
399 static bool PaPatchList(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
403 PaPatchList
<TotalControlPoints
, CurrentControlPoints
+ 1>,
404 PaPatchListSingle
<TotalControlPoints
>);
409 template<uint32_t TotalControlPoints
>
410 static bool PaPatchListTerm(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
412 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
413 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
414 // Each attribute has 4 components.
416 /// @todo Optimize this
418 #if USE_SIMD16_FRONTEND
419 uint32_t lane_offset
= 0;
421 if (pa
.useAlternateOffset
)
423 lane_offset
= KNOB_SIMD_WIDTH
;
427 // Loop over all components of the attribute
428 for (uint32_t i
= 0; i
< 4; ++i
)
430 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
432 float vec
[KNOB_SIMD_WIDTH
];
433 for (uint32_t lane
= 0; lane
< KNOB_SIMD_WIDTH
; ++lane
)
435 #if USE_SIMD16_FRONTEND
436 uint32_t input_cp
= (lane
+ lane_offset
) * TotalControlPoints
+ cp
;
437 uint32_t input_vec
= input_cp
/ KNOB_SIMD16_WIDTH
;
438 uint32_t input_lane
= input_cp
% KNOB_SIMD16_WIDTH
;
440 const float* pInputVec
= (const float*)(&PaGetSimdVector_simd16(pa
, input_vec
, slot
)[i
]);
442 uint32_t input_cp
= lane
* TotalControlPoints
+ cp
;
443 uint32_t input_vec
= input_cp
/ KNOB_SIMD_WIDTH
;
444 uint32_t input_lane
= input_cp
% KNOB_SIMD_WIDTH
;
446 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
448 vec
[lane
] = pInputVec
[input_lane
];
450 verts
[cp
][i
] = _simd_loadu_ps(vec
);
456 PaPatchList
<TotalControlPoints
>,
457 PaPatchListSingle
<TotalControlPoints
>,
459 PA_STATE_OPT::SIMD_WIDTH
,
465 #if ENABLE_AVX512_SIMD16
466 template<uint32_t TotalControlPoints
, uint32_t CurrentControlPoints
= 1>
467 static bool PaPatchList_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
469 SetNextPaState_simd16(
471 PaPatchList_simd16
<TotalControlPoints
, CurrentControlPoints
+ 1>,
472 PaPatchList
<TotalControlPoints
, CurrentControlPoints
+ 1>,
473 PaPatchListSingle
<TotalControlPoints
>);
478 template<uint32_t TotalControlPoints
>
479 static bool PaPatchListTerm_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
481 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
482 // KNOB_SIMD16_WIDTH * 1 patch. This function is called once per attribute.
483 // Each attribute has 4 components.
485 /// @todo Optimize this
487 // Loop over all components of the attribute
488 for (uint32_t i
= 0; i
< 4; ++i
)
490 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
492 float vec
[KNOB_SIMD16_WIDTH
];
493 for (uint32_t lane
= 0; lane
< KNOB_SIMD16_WIDTH
; ++lane
)
495 uint32_t input_cp
= lane
* TotalControlPoints
+ cp
;
496 uint32_t input_vec
= input_cp
/ KNOB_SIMD16_WIDTH
;
497 uint32_t input_lane
= input_cp
% KNOB_SIMD16_WIDTH
;
499 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
500 vec
[lane
] = pInputVec
[input_lane
];
502 verts
[cp
][i
] = _simd16_loadu_ps(vec
);
506 SetNextPaState_simd16(
508 PaPatchList_simd16
<TotalControlPoints
>,
509 PaPatchList
<TotalControlPoints
>,
510 PaPatchListSingle
<TotalControlPoints
>,
512 PA_STATE_OPT::SIMD_WIDTH
,
519 #define PA_PATCH_LIST_TERMINATOR(N) \
520 template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
521 { return PaPatchListTerm<N>(pa, slot, verts); }
522 PA_PATCH_LIST_TERMINATOR(1)
523 PA_PATCH_LIST_TERMINATOR(2)
524 PA_PATCH_LIST_TERMINATOR(3)
525 PA_PATCH_LIST_TERMINATOR(4)
526 PA_PATCH_LIST_TERMINATOR(5)
527 PA_PATCH_LIST_TERMINATOR(6)
528 PA_PATCH_LIST_TERMINATOR(7)
529 PA_PATCH_LIST_TERMINATOR(8)
530 PA_PATCH_LIST_TERMINATOR(9)
531 PA_PATCH_LIST_TERMINATOR(10)
532 PA_PATCH_LIST_TERMINATOR(11)
533 PA_PATCH_LIST_TERMINATOR(12)
534 PA_PATCH_LIST_TERMINATOR(13)
535 PA_PATCH_LIST_TERMINATOR(14)
536 PA_PATCH_LIST_TERMINATOR(15)
537 PA_PATCH_LIST_TERMINATOR(16)
538 PA_PATCH_LIST_TERMINATOR(17)
539 PA_PATCH_LIST_TERMINATOR(18)
540 PA_PATCH_LIST_TERMINATOR(19)
541 PA_PATCH_LIST_TERMINATOR(20)
542 PA_PATCH_LIST_TERMINATOR(21)
543 PA_PATCH_LIST_TERMINATOR(22)
544 PA_PATCH_LIST_TERMINATOR(23)
545 PA_PATCH_LIST_TERMINATOR(24)
546 PA_PATCH_LIST_TERMINATOR(25)
547 PA_PATCH_LIST_TERMINATOR(26)
548 PA_PATCH_LIST_TERMINATOR(27)
549 PA_PATCH_LIST_TERMINATOR(28)
550 PA_PATCH_LIST_TERMINATOR(29)
551 PA_PATCH_LIST_TERMINATOR(30)
552 PA_PATCH_LIST_TERMINATOR(31)
553 PA_PATCH_LIST_TERMINATOR(32)
554 #undef PA_PATCH_LIST_TERMINATOR
#if ENABLE_AVX512_SIMD16
// simd16 counterpart: specialize PaPatchList_simd16<N, N> for every
// supported patch size (1..32 control points).
#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
    template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\
    { return PaPatchListTerm_simd16<N>(pa, slot, verts); }
PA_PATCH_LIST_TERMINATOR_SIMD16(1)
PA_PATCH_LIST_TERMINATOR_SIMD16(2)
PA_PATCH_LIST_TERMINATOR_SIMD16(3)
PA_PATCH_LIST_TERMINATOR_SIMD16(4)
PA_PATCH_LIST_TERMINATOR_SIMD16(5)
PA_PATCH_LIST_TERMINATOR_SIMD16(6)
PA_PATCH_LIST_TERMINATOR_SIMD16(7)
PA_PATCH_LIST_TERMINATOR_SIMD16(8)
PA_PATCH_LIST_TERMINATOR_SIMD16(9)
PA_PATCH_LIST_TERMINATOR_SIMD16(10)
PA_PATCH_LIST_TERMINATOR_SIMD16(11)
PA_PATCH_LIST_TERMINATOR_SIMD16(12)
PA_PATCH_LIST_TERMINATOR_SIMD16(13)
PA_PATCH_LIST_TERMINATOR_SIMD16(14)
PA_PATCH_LIST_TERMINATOR_SIMD16(15)
PA_PATCH_LIST_TERMINATOR_SIMD16(16)
PA_PATCH_LIST_TERMINATOR_SIMD16(17)
PA_PATCH_LIST_TERMINATOR_SIMD16(18)
PA_PATCH_LIST_TERMINATOR_SIMD16(19)
PA_PATCH_LIST_TERMINATOR_SIMD16(20)
PA_PATCH_LIST_TERMINATOR_SIMD16(21)
PA_PATCH_LIST_TERMINATOR_SIMD16(22)
PA_PATCH_LIST_TERMINATOR_SIMD16(23)
PA_PATCH_LIST_TERMINATOR_SIMD16(24)
PA_PATCH_LIST_TERMINATOR_SIMD16(25)
PA_PATCH_LIST_TERMINATOR_SIMD16(26)
PA_PATCH_LIST_TERMINATOR_SIMD16(27)
PA_PATCH_LIST_TERMINATOR_SIMD16(28)
PA_PATCH_LIST_TERMINATOR_SIMD16(29)
PA_PATCH_LIST_TERMINATOR_SIMD16(30)
PA_PATCH_LIST_TERMINATOR_SIMD16(31)
PA_PATCH_LIST_TERMINATOR_SIMD16(32)
#undef PA_PATCH_LIST_TERMINATOR_SIMD16
#endif
595 bool PaTriList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
597 SetNextPaState(pa
, PaTriList1
, PaTriListSingle0
);
598 return false; // Not enough vertices to assemble 4 or 8 triangles.
601 bool PaTriList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
603 SetNextPaState(pa
, PaTriList2
, PaTriListSingle0
);
604 return false; // Not enough vertices to assemble 8 triangles.
607 bool PaTriList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
609 #if KNOB_ARCH == KNOB_ARCH_AVX
610 #if USE_SIMD16_FRONTEND
615 if (!pa
.useAlternateOffset
)
617 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
618 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
620 for (uint32_t i
= 0; i
< 4; i
+= 1)
622 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
623 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
624 c
[i
] = _simd16_extract_ps(b_16
[i
], 0);
629 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
630 const simd16vector
&c_16
= PaGetSimdVector_simd16(pa
, 2, slot
);
632 for (uint32_t i
= 0; i
< 4; i
+= 1)
634 a
[i
] = _simd16_extract_ps(b_16
[i
], 1);
635 b
[i
] = _simd16_extract_ps(c_16
[i
], 0);
636 c
[i
] = _simd16_extract_ps(c_16
[i
], 1);
641 simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
642 simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
643 simdvector
&c
= PaGetSimdVector(pa
, 2, slot
);
648 // Tri Pattern - provoking vertex is always v0
649 // v0 -> 0 3 6 9 12 15 18 21
650 // v1 -> 1 4 7 10 13 16 19 22
651 // v2 -> 2 5 8 11 14 17 20 23
653 for (int i
= 0; i
< 4; ++i
)
655 simdvector
& v0
= verts
[0];
656 v0
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x92);
657 v0
[i
] = _simd_blend_ps(v0
[i
], c
[i
], 0x24);
658 v0
[i
] = _mm256_permute_ps(v0
[i
], 0x6C);
659 s
= _mm256_permute2f128_ps(v0
[i
], v0
[i
], 0x21);
660 v0
[i
] = _simd_blend_ps(v0
[i
], s
, 0x44);
662 simdvector
& v1
= verts
[1];
663 v1
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x24);
664 v1
[i
] = _simd_blend_ps(v1
[i
], c
[i
], 0x49);
665 v1
[i
] = _mm256_permute_ps(v1
[i
], 0xB1);
666 s
= _mm256_permute2f128_ps(v1
[i
], v1
[i
], 0x21);
667 v1
[i
] = _simd_blend_ps(v1
[i
], s
, 0x66);
669 simdvector
& v2
= verts
[2];
670 v2
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x49);
671 v2
[i
] = _simd_blend_ps(v2
[i
], c
[i
], 0x92);
672 v2
[i
] = _mm256_permute_ps(v2
[i
], 0xC6);
673 s
= _mm256_permute2f128_ps(v2
[i
], v2
[i
], 0x21);
674 v2
[i
] = _simd_blend_ps(v2
[i
], s
, 0x22);
677 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
678 const simdscalari perm0
= _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
679 const simdscalari perm1
= _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
680 const simdscalari perm2
= _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
682 #if USE_SIMD16_FRONTEND
687 if (!pa
.useAlternateOffset
)
689 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
690 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
692 for (uint32_t i
= 0; i
< 4; i
+= 1)
694 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
695 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
696 c
[i
] = _simd16_extract_ps(b_16
[i
], 0);
701 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
702 const simd16vector
&c_16
= PaGetSimdVector_simd16(pa
, 2, slot
);
704 for (uint32_t i
= 0; i
< 4; i
+= 1)
706 a
[i
] = _simd16_extract_ps(b_16
[i
], 1);
707 b
[i
] = _simd16_extract_ps(c_16
[i
], 0);
708 c
[i
] = _simd16_extract_ps(c_16
[i
], 1);
713 const simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
714 const simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
715 const simdvector
&c
= PaGetSimdVector(pa
, 2, slot
);
718 // v0 -> a0 a3 a6 b1 b4 b7 c2 c5
719 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6
720 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7
722 simdvector
&v0
= verts
[0];
723 simdvector
&v1
= verts
[1];
724 simdvector
&v2
= verts
[2];
726 // for simd x, y, z, and w
727 for (int i
= 0; i
< 4; ++i
)
729 simdscalar temp0
= _simd_blend_ps(_simd_blend_ps(a
[i
], b
[i
], 0x92), c
[i
], 0x24);
730 simdscalar temp1
= _simd_blend_ps(_simd_blend_ps(a
[i
], b
[i
], 0x24), c
[i
], 0x49);
731 simdscalar temp2
= _simd_blend_ps(_simd_blend_ps(a
[i
], b
[i
], 0x49), c
[i
], 0x92);
733 v0
[i
] = _simd_permute_ps(temp0
, perm0
);
734 v1
[i
] = _simd_permute_ps(temp1
, perm1
);
735 v2
[i
] = _simd_permute_ps(temp2
, perm2
);
739 SetNextPaState(pa
, PaTriList0
, PaTriListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
743 #if ENABLE_AVX512_SIMD16
744 bool PaTriList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
746 SetNextPaState_simd16(pa
, PaTriList1_simd16
, PaTriList1
, PaTriListSingle0
);
747 return false; // Not enough vertices to assemble 16 triangles
750 bool PaTriList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
752 SetNextPaState_simd16(pa
, PaTriList2_simd16
, PaTriList2
, PaTriListSingle0
);
753 return false; // Not enough vertices to assemble 16 triangles
756 bool PaTriList2_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
758 const simd16scalari perm0
= _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0);
759 const simd16scalari perm1
= _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1);
760 const simd16scalari perm2
= _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2);
762 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
763 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, 1, slot
);
764 const simd16vector
&c
= PaGetSimdVector_simd16(pa
, 2, slot
);
766 simd16vector
&v0
= verts
[0];
767 simd16vector
&v1
= verts
[1];
768 simd16vector
&v2
= verts
[2];
770 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
771 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
772 // v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
774 // for simd16 x, y, z, and w
775 for (int i
= 0; i
< 4; i
+= 1)
777 simd16scalar temp0
= _simd16_blend_ps(_simd16_blend_ps(a
[i
], b
[i
], 0x4924), c
[i
], 0x2492);
778 simd16scalar temp1
= _simd16_blend_ps(_simd16_blend_ps(a
[i
], b
[i
], 0x9249), c
[i
], 0x4924);
779 simd16scalar temp2
= _simd16_blend_ps(_simd16_blend_ps(a
[i
], b
[i
], 0x2492), c
[i
], 0x9249);
781 v0
[i
] = _simd16_permute_ps(temp0
, perm0
);
782 v1
[i
] = _simd16_permute_ps(temp1
, perm1
);
783 v2
[i
] = _simd16_permute_ps(temp2
, perm2
);
786 SetNextPaState_simd16(pa
, PaTriList0_simd16
, PaTriList0
, PaTriListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
791 void PaTriListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
793 #if USE_SIMD16_FRONTEND
794 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
795 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, 1, slot
);
796 const simd16vector
&c
= PaGetSimdVector_simd16(pa
, 2, slot
);
798 if (pa
.useAlternateOffset
)
800 primIndex
+= KNOB_SIMD_WIDTH
;
803 // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
804 // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
805 // v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
810 verts
[0] = swizzleLane0(a
);
811 verts
[1] = swizzleLane1(a
);
812 verts
[2] = swizzleLane2(a
);
815 verts
[0] = swizzleLane3(a
);
816 verts
[1] = swizzleLane4(a
);
817 verts
[2] = swizzleLane5(a
);
820 verts
[0] = swizzleLane6(a
);
821 verts
[1] = swizzleLane7(a
);
822 verts
[2] = swizzleLane8(a
);
825 verts
[0] = swizzleLane9(a
);
826 verts
[1] = swizzleLaneA(a
);
827 verts
[2] = swizzleLaneB(a
);
830 verts
[0] = swizzleLaneC(a
);
831 verts
[1] = swizzleLaneD(a
);
832 verts
[2] = swizzleLaneE(a
);
835 verts
[0] = swizzleLaneF(a
);
836 verts
[1] = swizzleLane0(b
);
837 verts
[2] = swizzleLane1(b
);
840 verts
[0] = swizzleLane2(b
);
841 verts
[1] = swizzleLane3(b
);
842 verts
[2] = swizzleLane4(b
);
845 verts
[0] = swizzleLane5(b
);
846 verts
[1] = swizzleLane6(b
);
847 verts
[2] = swizzleLane7(b
);
850 verts
[0] = swizzleLane8(b
);
851 verts
[1] = swizzleLane9(b
);
852 verts
[2] = swizzleLaneA(b
);
855 verts
[0] = swizzleLaneB(b
);
856 verts
[1] = swizzleLaneC(b
);
857 verts
[2] = swizzleLaneD(b
);
860 verts
[0] = swizzleLaneE(b
);
861 verts
[1] = swizzleLaneF(b
);
862 verts
[2] = swizzleLane0(c
);
865 verts
[0] = swizzleLane1(c
);
866 verts
[1] = swizzleLane2(c
);
867 verts
[2] = swizzleLane3(c
);
870 verts
[0] = swizzleLane4(c
);
871 verts
[1] = swizzleLane5(c
);
872 verts
[2] = swizzleLane6(c
);
875 verts
[0] = swizzleLane7(c
);
876 verts
[1] = swizzleLane8(c
);
877 verts
[2] = swizzleLane9(c
);
880 verts
[0] = swizzleLaneA(c
);
881 verts
[1] = swizzleLaneB(c
);
882 verts
[2] = swizzleLaneC(c
);
885 verts
[0] = swizzleLaneD(c
);
886 verts
[1] = swizzleLaneE(c
);
887 verts
[2] = swizzleLaneF(c
);
891 // We have 12 simdscalars contained within 3 simdvectors which
892 // hold at least 8 triangles worth of data. We want to assemble a single
893 // triangle with data in horizontal form.
895 const simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
896 const simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
897 const simdvector
&c
= PaGetSimdVector(pa
, 2, slot
);
899 // Convert from vertical to horizontal.
900 // Tri Pattern - provoking vertex is always v0
901 // v0 -> 0 3 6 9 12 15 18 21
902 // v1 -> 1 4 7 10 13 16 19 22
903 // v2 -> 2 5 8 11 14 17 20 23
908 verts
[0] = swizzleLane0(a
);
909 verts
[1] = swizzleLane1(a
);
910 verts
[2] = swizzleLane2(a
);
913 verts
[0] = swizzleLane3(a
);
914 verts
[1] = swizzleLane4(a
);
915 verts
[2] = swizzleLane5(a
);
918 verts
[0] = swizzleLane6(a
);
919 verts
[1] = swizzleLane7(a
);
920 verts
[2] = swizzleLane0(b
);
923 verts
[0] = swizzleLane1(b
);
924 verts
[1] = swizzleLane2(b
);
925 verts
[2] = swizzleLane3(b
);
928 verts
[0] = swizzleLane4(b
);
929 verts
[1] = swizzleLane5(b
);
930 verts
[2] = swizzleLane6(b
);
933 verts
[0] = swizzleLane7(b
);
934 verts
[1] = swizzleLane0(c
);
935 verts
[2] = swizzleLane1(c
);
938 verts
[0] = swizzleLane2(c
);
939 verts
[1] = swizzleLane3(c
);
940 verts
[2] = swizzleLane4(c
);
943 verts
[0] = swizzleLane5(c
);
944 verts
[1] = swizzleLane6(c
);
945 verts
[2] = swizzleLane7(c
);
951 bool PaTriStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
953 SetNextPaState(pa
, PaTriStrip1
, PaTriStripSingle0
);
954 return false; // Not enough vertices to assemble 8 triangles.
957 bool PaTriStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
959 #if USE_SIMD16_FRONTEND
963 if (!pa
.useAlternateOffset
)
965 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
967 for (uint32_t i
= 0; i
< 4; i
+= 1)
969 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
970 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
975 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
977 for (uint32_t i
= 0; i
< 4; i
+= 1)
979 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
980 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);
985 simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
986 simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
991 for(int i
= 0; i
< 4; ++i
)
993 simdscalar a0
= a
[i
];
994 simdscalar b0
= b
[i
];
996 // Tri Pattern - provoking vertex is always v0
1000 simdvector
& v0
= verts
[0];
1004 s
= _simd_permute2f128_ps(a0
, b0
, 0x21);
1006 s
= _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(1, 0, 3, 2));
1008 simdvector
& v1
= verts
[1];
1010 v1
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(3, 1, 3, 1));
1012 simdvector
& v2
= verts
[2];
1014 v2
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(2, 2, 2, 2));
1017 SetNextPaState(pa
, PaTriStrip1
, PaTriStripSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
1021 #if ENABLE_AVX512_SIMD16
1022 bool PaTriStrip0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1024 SetNextPaState_simd16(pa
, PaTriStrip1_simd16
, PaTriStrip1
, PaTriStripSingle0
);
1025 return false; // Not enough vertices to assemble 16 triangles.
1028 bool PaTriStrip1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1030 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
1031 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
1033 simd16vector
&v0
= verts
[0];
1034 simd16vector
&v1
= verts
[1];
1035 simd16vector
&v2
= verts
[2];
1037 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1038 // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1039 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1041 // for simd16 x, y, z, and w
1042 for (int i
= 0; i
< 4; i
+= 1)
1044 simd16scalar perm0
= _simd16_permute2f128_ps(a
[i
], a
[i
], 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
1045 simd16scalar perm1
= _simd16_permute2f128_ps(b
[i
], b
[i
], 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1047 simd16scalar blend
= _simd16_blend_ps(perm0
, perm1
, 0xF000); // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
1048 simd16scalar shuff
= _simd16_shuffle_ps(a
[i
], blend
, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
1050 v0
[i
] = a
[i
]; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1051 v1
[i
] = _simd16_shuffle_ps(a
[i
], shuff
, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1052 v2
[i
] = _simd16_shuffle_ps(a
[i
], shuff
, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1055 SetNextPaState_simd16(pa
, PaTriStrip1_simd16
, PaTriStrip1
, PaTriStripSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
1060 void PaTriStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1062 #if USE_SIMD16_FRONTEND
1063 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
1064 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
1066 if (pa
.useAlternateOffset
)
1068 primIndex
+= KNOB_SIMD_WIDTH
;
1071 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1072 // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1073 // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1078 verts
[0] = swizzleLane0(a
);
1079 verts
[1] = swizzleLane1(a
);
1080 verts
[2] = swizzleLane2(a
);
1083 verts
[0] = swizzleLane1(a
);
1084 verts
[1] = swizzleLane3(a
);
1085 verts
[2] = swizzleLane2(a
);
1088 verts
[0] = swizzleLane2(a
);
1089 verts
[1] = swizzleLane3(a
);
1090 verts
[2] = swizzleLane4(a
);
1093 verts
[0] = swizzleLane3(a
);
1094 verts
[1] = swizzleLane5(a
);
1095 verts
[2] = swizzleLane4(a
);
1098 verts
[0] = swizzleLane4(a
);
1099 verts
[1] = swizzleLane5(a
);
1100 verts
[2] = swizzleLane6(a
);
1103 verts
[0] = swizzleLane5(a
);
1104 verts
[1] = swizzleLane7(a
);
1105 verts
[2] = swizzleLane6(a
);
1108 verts
[0] = swizzleLane6(a
);
1109 verts
[1] = swizzleLane7(a
);
1110 verts
[2] = swizzleLane8(a
);
1113 verts
[0] = swizzleLane7(a
);
1114 verts
[1] = swizzleLane9(a
);
1115 verts
[2] = swizzleLane8(a
);
1118 verts
[0] = swizzleLane8(a
);
1119 verts
[1] = swizzleLane9(a
);
1120 verts
[2] = swizzleLaneA(a
);
1123 verts
[0] = swizzleLane9(a
);
1124 verts
[1] = swizzleLaneB(a
);
1125 verts
[2] = swizzleLaneA(a
);
1128 verts
[0] = swizzleLaneA(a
);
1129 verts
[1] = swizzleLaneB(a
);
1130 verts
[2] = swizzleLaneC(a
);
1133 verts
[0] = swizzleLaneB(a
);
1134 verts
[1] = swizzleLaneD(a
);
1135 verts
[2] = swizzleLaneC(a
);
1138 verts
[0] = swizzleLaneC(a
);
1139 verts
[1] = swizzleLaneD(a
);
1140 verts
[2] = swizzleLaneE(a
);
1143 verts
[0] = swizzleLaneD(a
);
1144 verts
[1] = swizzleLaneF(a
);
1145 verts
[2] = swizzleLaneE(a
);
1148 verts
[0] = swizzleLaneE(a
);
1149 verts
[1] = swizzleLaneF(a
);
1150 verts
[2] = swizzleLane0(b
);
1153 verts
[0] = swizzleLaneF(a
);
1154 verts
[1] = swizzleLane1(b
);
1155 verts
[2] = swizzleLane0(b
);
1159 const simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
1160 const simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
1162 // Convert from vertical to horizontal.
1163 // Tri Pattern - provoking vertex is always v0
1171 verts
[0] = swizzleLane0(a
);
1172 verts
[1] = swizzleLane1(a
);
1173 verts
[2] = swizzleLane2(a
);
1176 verts
[0] = swizzleLane1(a
);
1177 verts
[1] = swizzleLane3(a
);
1178 verts
[2] = swizzleLane2(a
);
1181 verts
[0] = swizzleLane2(a
);
1182 verts
[1] = swizzleLane3(a
);
1183 verts
[2] = swizzleLane4(a
);
1186 verts
[0] = swizzleLane3(a
);
1187 verts
[1] = swizzleLane5(a
);
1188 verts
[2] = swizzleLane4(a
);
1191 verts
[0] = swizzleLane4(a
);
1192 verts
[1] = swizzleLane5(a
);
1193 verts
[2] = swizzleLane6(a
);
1196 verts
[0] = swizzleLane5(a
);
1197 verts
[1] = swizzleLane7(a
);
1198 verts
[2] = swizzleLane6(a
);
1201 verts
[0] = swizzleLane6(a
);
1202 verts
[1] = swizzleLane7(a
);
1203 verts
[2] = swizzleLane0(b
);
1206 verts
[0] = swizzleLane7(a
);
1207 verts
[1] = swizzleLane1(b
);
1208 verts
[2] = swizzleLane0(b
);
1214 bool PaTriFan0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1216 SetNextPaState(pa
, PaTriFan1
, PaTriFanSingle0
);
1217 return false; // Not enough vertices to assemble 8 triangles.
1220 bool PaTriFan1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1222 #if USE_SIMD16_FRONTEND
1223 simdvector leadVert
;
1227 const simd16vector
&leadvert_16
= PaGetSimdVector_simd16(pa
, pa
.first
, slot
);
1229 if (!pa
.useAlternateOffset
)
1231 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
1233 for (uint32_t i
= 0; i
< 4; i
+= 1)
1235 leadVert
[i
] = _simd16_extract_ps(leadvert_16
[i
], 0);
1237 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
1238 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
1243 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
1245 for (uint32_t i
= 0; i
< 4; i
+= 1)
1247 leadVert
[i
] = _simd16_extract_ps(leadvert_16
[i
], 0);
1249 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
1250 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);
1255 const simdvector
&leadVert
= PaGetSimdVector(pa
, pa
.first
, slot
);
1256 const simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
1257 const simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
1262 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
1263 for(int i
= 0; i
< 4; ++i
)
1265 simdscalar a0
= a
[i
];
1266 simdscalar b0
= b
[i
];
1268 simdscalar comp
= leadVert
[i
];
1270 simdvector
& v0
= verts
[0];
1271 v0
[i
] = _simd_shuffle_ps(comp
, comp
, _MM_SHUFFLE(0, 0, 0, 0));
1272 v0
[i
] = _simd_permute2f128_ps(v0
[i
], comp
, 0x00);
1274 simdvector
& v2
= verts
[2];
1275 s
= _simd_permute2f128_ps(a0
, b0
, 0x21);
1276 v2
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(1, 0, 3, 2));
1278 simdvector
& v1
= verts
[1];
1279 v1
[i
] = _simd_shuffle_ps(a0
, v2
[i
], _MM_SHUFFLE(2, 1, 2, 1));
1282 SetNextPaState(pa
, PaTriFan1
, PaTriFanSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
1286 #if ENABLE_AVX512_SIMD16
1287 bool PaTriFan0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1289 SetNextPaState_simd16(pa
, PaTriFan1_simd16
, PaTriFan1
, PaTriFanSingle0
);
1290 return false; // Not enough vertices to assemble 16 triangles.
1293 bool PaTriFan1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1295 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.first
, slot
);
1296 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
1297 const simd16vector
&c
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
1299 simd16vector
&v0
= verts
[0];
1300 simd16vector
&v1
= verts
[1];
1301 simd16vector
&v2
= verts
[2];
1303 // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1304 // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1305 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1307 // for simd16 x, y, z, and w
1308 for (uint32_t i
= 0; i
< 4; i
+= 1)
1310 simd16scalar shuff
= _simd16_shuffle_ps(a
[i
], a
[i
], _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
1312 v0
[i
] = _simd16_permute2f128_ps(shuff
, shuff
, 0x00); // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1314 simd16scalar temp0
= _simd16_permute2f128_ps(b
[i
], b
[i
], 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1315 simd16scalar temp1
= _simd16_permute2f128_ps(c
[i
], c
[i
], 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
1317 simd16scalar blend
= _simd16_blend_ps(temp0
, temp1
, 0xF000); // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
1319 v2
[i
] = _simd16_shuffle_ps(b
[i
], blend
, _MM_SHUFFLE(1, 0, 3, 2)); // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1320 v1
[i
] = _simd16_shuffle_ps(b
[i
], v2
[i
], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1323 SetNextPaState_simd16(pa
, PaTriFan1_simd16
, PaTriFan1
, PaTriFanSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
1328 void PaTriFanSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1330 #if USE_SIMD16_FRONTEND
1331 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.first
, slot
);
1332 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
1333 const simd16vector
&c
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
1335 if (pa
.useAlternateOffset
)
1337 primIndex
+= KNOB_SIMD_WIDTH
;
1340 // v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1341 // v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1342 // v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1344 // vert 0 from leading vertex
1345 verts
[0] = swizzleLane0(a
);
1350 verts
[1] = swizzleLaneN(b
, primIndex
+ 1);
1354 verts
[1] = swizzleLane0(c
);
1360 verts
[2] = swizzleLaneN(b
, primIndex
+ 2);
1364 verts
[2] = swizzleLaneN(c
, primIndex
- 14);
1367 const simdvector
&a
= PaGetSimdVector(pa
, pa
.first
, slot
);
1368 const simdvector
&b
= PaGetSimdVector(pa
, pa
.prev
, slot
);
1369 const simdvector
&c
= PaGetSimdVector(pa
, pa
.cur
, slot
);
1371 // vert 0 from leading vertex
1372 verts
[0] = swizzleLane0(a
);
1377 verts
[1] = swizzleLaneN(b
, primIndex
+ 1);
1381 verts
[1] = swizzleLane0(c
);
1387 verts
[2] = swizzleLaneN(b
, primIndex
+ 2);
1391 verts
[2] = swizzleLaneN(c
, primIndex
- 6);
1396 bool PaQuadList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1398 SetNextPaState(pa
, PaQuadList1
, PaQuadListSingle0
);
1399 return false; // Not enough vertices to assemble 8 triangles.
1402 bool PaQuadList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1404 #if USE_SIMD16_FRONTEND
1408 if (!pa
.useAlternateOffset
)
1410 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
1412 for (uint32_t i
= 0; i
< 4; i
+= 1)
1414 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
1415 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
1420 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
1422 for (uint32_t i
= 0; i
< 4; i
+= 1)
1424 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
1425 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);
1430 simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
1431 simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
1436 for(int i
= 0; i
< 4; ++i
)
1438 simdscalar a0
= a
[i
];
1439 simdscalar b0
= b
[i
];
1441 s1
= _mm256_permute2f128_ps(a0
, b0
, 0x20);
1442 s2
= _mm256_permute2f128_ps(a0
, b0
, 0x31);
1444 simdvector
& v0
= verts
[0];
1445 v0
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(0, 0, 0, 0));
1447 simdvector
& v1
= verts
[1];
1448 v1
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(2, 1, 2, 1));
1450 simdvector
& v2
= verts
[2];
1451 v2
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(3, 2, 3, 2));
1454 SetNextPaState(pa
, PaQuadList0
, PaQuadListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
1458 #if ENABLE_AVX512_SIMD16
1459 bool PaQuadList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1461 SetNextPaState_simd16(pa
, PaQuadList1_simd16
, PaQuadList1
, PaQuadListSingle0
);
1462 return false; // Not enough vertices to assemble 16 triangles.
1465 bool PaQuadList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1467 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
1468 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, 1, slot
);
1470 simd16vector
&v0
= verts
[0];
1471 simd16vector
&v1
= verts
[1];
1472 simd16vector
&v2
= verts
[2];
1474 // v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC
1475 // v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1476 // v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1478 // for simd16 x, y, z, and w
1479 for (uint32_t i
= 0; i
< 4; i
+= 1)
1481 simd16scalar temp0
= _simd16_permute2f128_ps(a
[i
], b
[i
], 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1482 simd16scalar temp1
= _simd16_permute2f128_ps(a
[i
], b
[i
], 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1484 v0
[i
] = _simd16_shuffle_ps(temp0
, temp1
, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1485 v1
[i
] = _simd16_shuffle_ps(temp0
, temp1
, _MM_SHUFFLE(2, 1, 2, 1)); // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE
1486 v2
[i
] = _simd16_shuffle_ps(temp0
, temp1
, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1489 SetNextPaState_simd16(pa
, PaQuadList0_simd16
, PaQuadList0
, PaQuadListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
1494 void PaQuadListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1496 #if USE_SIMD16_FRONTEND
1497 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
1498 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, 1, slot
);
1500 if (pa
.useAlternateOffset
)
1502 primIndex
+= KNOB_SIMD_WIDTH
;
1508 // triangle 0 - 0 1 2
1509 verts
[0] = swizzleLane0(a
);
1510 verts
[1] = swizzleLane1(a
);
1511 verts
[2] = swizzleLane2(a
);
1514 // triangle 1 - 0 2 3
1515 verts
[0] = swizzleLane0(a
);
1516 verts
[1] = swizzleLane2(a
);
1517 verts
[2] = swizzleLane3(a
);
1520 // triangle 2 - 4 5 6
1521 verts
[0] = swizzleLane4(a
);
1522 verts
[1] = swizzleLane5(a
);
1523 verts
[2] = swizzleLane6(a
);
1526 // triangle 3 - 4 6 7
1527 verts
[0] = swizzleLane4(a
);
1528 verts
[1] = swizzleLane6(a
);
1529 verts
[2] = swizzleLane7(a
);
1532 // triangle 4 - 8 9 A
1533 verts
[0] = swizzleLane8(a
);
1534 verts
[1] = swizzleLane9(a
);
1535 verts
[2] = swizzleLaneA(a
);
1538 // triangle 5 - 8 A B
1539 verts
[0] = swizzleLane8(a
);
1540 verts
[1] = swizzleLaneA(a
);
1541 verts
[2] = swizzleLaneB(a
);
1544 // triangle 6 - C D E
1545 verts
[0] = swizzleLaneC(a
);
1546 verts
[1] = swizzleLaneD(a
);
1547 verts
[2] = swizzleLaneE(a
);
1550 // triangle 7 - C E F
1551 verts
[0] = swizzleLaneC(a
);
1552 verts
[1] = swizzleLaneE(a
);
1553 verts
[2] = swizzleLaneF(a
);
1556 // triangle 0 - 0 1 2
1557 verts
[0] = swizzleLane0(b
);
1558 verts
[1] = swizzleLane1(b
);
1559 verts
[2] = swizzleLane2(b
);
1562 // triangle 1 - 0 2 3
1563 verts
[0] = swizzleLane0(b
);
1564 verts
[1] = swizzleLane2(b
);
1565 verts
[2] = swizzleLane3(b
);
1568 // triangle 2 - 4 5 6
1569 verts
[0] = swizzleLane4(b
);
1570 verts
[1] = swizzleLane5(b
);
1571 verts
[2] = swizzleLane6(b
);
1574 // triangle 3 - 4 6 7
1575 verts
[0] = swizzleLane4(b
);
1576 verts
[1] = swizzleLane6(b
);
1577 verts
[2] = swizzleLane7(b
);
1580 // triangle 4 - 8 9 A
1581 verts
[0] = swizzleLane8(b
);
1582 verts
[1] = swizzleLane9(b
);
1583 verts
[2] = swizzleLaneA(b
);
1586 // triangle 5 - 8 A B
1587 verts
[0] = swizzleLane8(b
);
1588 verts
[1] = swizzleLaneA(b
);
1589 verts
[2] = swizzleLaneB(b
);
1592 // triangle 6 - C D E
1593 verts
[0] = swizzleLaneC(b
);
1594 verts
[1] = swizzleLaneD(b
);
1595 verts
[2] = swizzleLaneE(b
);
1598 // triangle 7 - C E F
1599 verts
[0] = swizzleLaneC(b
);
1600 verts
[1] = swizzleLaneE(b
);
1601 verts
[2] = swizzleLaneF(b
);
1605 const simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
1606 const simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
1611 // triangle 0 - 0 1 2
1612 verts
[0] = swizzleLane0(a
);
1613 verts
[1] = swizzleLane1(a
);
1614 verts
[2] = swizzleLane2(a
);
1617 // triangle 1 - 0 2 3
1618 verts
[0] = swizzleLane0(a
);
1619 verts
[1] = swizzleLane2(a
);
1620 verts
[2] = swizzleLane3(a
);
1623 // triangle 2 - 4 5 6
1624 verts
[0] = swizzleLane4(a
);
1625 verts
[1] = swizzleLane5(a
);
1626 verts
[2] = swizzleLane6(a
);
1629 // triangle 3 - 4 6 7
1630 verts
[0] = swizzleLane4(a
);
1631 verts
[1] = swizzleLane6(a
);
1632 verts
[2] = swizzleLane7(a
);
1635 // triangle 4 - 8 9 10 (0 1 2)
1636 verts
[0] = swizzleLane0(b
);
1637 verts
[1] = swizzleLane1(b
);
1638 verts
[2] = swizzleLane2(b
);
1641 // triangle 1 - 0 2 3
1642 verts
[0] = swizzleLane0(b
);
1643 verts
[1] = swizzleLane2(b
);
1644 verts
[2] = swizzleLane3(b
);
1647 // triangle 2 - 4 5 6
1648 verts
[0] = swizzleLane4(b
);
1649 verts
[1] = swizzleLane5(b
);
1650 verts
[2] = swizzleLane6(b
);
1653 // triangle 3 - 4 6 7
1654 verts
[0] = swizzleLane4(b
);
1655 verts
[1] = swizzleLane6(b
);
1656 verts
[2] = swizzleLane7(b
);
1662 bool PaLineLoop0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1664 SetNextPaState(pa
, PaLineLoop1
, PaLineLoopSingle0
);
1668 bool PaLineLoop1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1670 PaLineStrip1(pa
, slot
, verts
);
1672 if (pa
.numPrimsComplete
+ KNOB_SIMD_WIDTH
> pa
.numPrims
- 1)
1674 // loop reconnect now
1675 const int lane
= pa
.numPrims
- pa
.numPrimsComplete
- 1;
1677 #if USE_SIMD16_FRONTEND
1680 const simd16vector
&first_16
= PaGetSimdVector_simd16(pa
, pa
.first
, slot
);
1682 if (!pa
.useAlternateOffset
)
1684 for (uint32_t i
= 0; i
< 4; i
+= 1)
1686 first
[i
] = _simd16_extract_ps(first_16
[i
], 0);
1691 for (uint32_t i
= 0; i
< 4; i
+= 1)
1693 first
[i
] = _simd16_extract_ps(first_16
[i
], 1);
1698 simdvector
&first
= PaGetSimdVector(pa
, pa
.first
, slot
);
1701 for (int i
= 0; i
< 4; i
++)
1703 float *firstVtx
= (float *)&(first
[i
]);
1704 float *targetVtx
= (float *)&(verts
[1][i
]);
1705 targetVtx
[lane
] = firstVtx
[0];
1709 SetNextPaState(pa
, PaLineLoop1
, PaLineLoopSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
1713 #if ENABLE_AVX512_SIMD16
1714 bool PaLineLoop0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1716 SetNextPaState_simd16(pa
, PaLineLoop1_simd16
, PaLineLoop1
, PaLineLoopSingle0
);
1720 bool PaLineLoop1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1722 PaLineStrip1_simd16(pa
, slot
, verts
);
1724 if (pa
.numPrimsComplete
+ KNOB_SIMD16_WIDTH
> pa
.numPrims
- 1)
1726 // loop reconnect now
1727 const int lane
= pa
.numPrims
- pa
.numPrimsComplete
- 1;
1729 const simd16vector
&first
= PaGetSimdVector_simd16(pa
, pa
.first
, slot
);
1731 for (int i
= 0; i
< 4; i
++)
1733 float *firstVtx
= (float *)&(first
[i
]);
1734 float *targetVtx
= (float *)&(verts
[1][i
]);
1735 targetVtx
[lane
] = firstVtx
[0];
1739 SetNextPaState_simd16(pa
, PaLineLoop1_simd16
, PaLineLoop1
, PaLineLoopSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
1744 void PaLineLoopSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1746 PaLineStripSingle0(pa
, slot
, primIndex
, verts
);
1748 if (pa
.numPrimsComplete
+ primIndex
== pa
.numPrims
- 1)
1750 #if USE_SIMD16_FRONTEND
1751 const simd16vector
&first
= PaGetSimdVector_simd16(pa
, pa
.first
, slot
);
1753 verts
[1] = swizzleLane0(first
);
1755 const simdvector
&first
= PaGetSimdVector(pa
, pa
.first
, slot
);
1757 verts
[1] = swizzleLane0(first
);
1762 bool PaLineList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1764 SetNextPaState(pa
, PaLineList1
, PaLineListSingle0
);
1765 return false; // Not enough vertices to assemble 8 lines
1768 bool PaLineList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1770 #if USE_SIMD16_FRONTEND
1774 if (!pa
.useAlternateOffset
)
1776 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
1778 for (uint32_t i
= 0; i
< 4; i
+= 1)
1780 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
1781 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
1786 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
1788 for (uint32_t i
= 0; i
< 4; i
+= 1)
1790 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
1791 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);
1796 simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
1797 simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
1800 /// @todo: verify provoking vertex is correct
1801 // Line list 0 1 2 3 4 5 6 7
1802 // 8 9 10 11 12 13 14 15
1805 // 0 2 4 6 8 10 12 14
1806 // 1 3 5 7 9 11 13 15
1808 for (uint32_t i
= 0; i
< 4; ++i
)
1810 // 0 1 2 3 8 9 10 11
1811 __m256 vALowBLow
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x20);
1812 // 4 5 6 7 12 13 14 15
1813 __m256 vAHighBHigh
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x31);
1815 // 0 2 4 6 8 10 12 14
1816 verts
[0].v
[i
] = _mm256_shuffle_ps(vALowBLow
, vAHighBHigh
, _MM_SHUFFLE(2, 0, 2, 0));
1817 // 1 3 5 7 9 11 13 15
1818 verts
[1].v
[i
] = _mm256_shuffle_ps(vALowBLow
, vAHighBHigh
, _MM_SHUFFLE(3, 1, 3, 1));
1821 SetNextPaState(pa
, PaLineList0
, PaLineListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
1825 #if ENABLE_AVX512_SIMD16
1826 bool PaLineList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1828 SetNextPaState_simd16(pa
, PaLineList1_simd16
, PaLineList1
, PaLineListSingle0
);
1829 return false; // Not enough vertices to assemble 16 lines
1832 bool PaLineList1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
1834 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
1835 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, 1, slot
);
1837 simd16vector
&v0
= verts
[0];
1838 simd16vector
&v1
= verts
[1];
1840 // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1841 // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF
1843 // for simd16 x, y, z, and w
1844 for (int i
= 0; i
< 4; i
+= 1)
1846 simd16scalar temp0
= _simd16_permute2f128_ps(a
[i
], b
[i
], 0x88); // (2 0 2 0) 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB
1847 simd16scalar temp1
= _simd16_permute2f128_ps(a
[i
], b
[i
], 0xDD); // (3 1 3 1) 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1849 v0
[i
] = _simd16_shuffle_ps(temp0
, temp1
, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1850 v1
[i
] = _simd16_shuffle_ps(temp0
, temp1
, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
1853 SetNextPaState_simd16(pa
, PaLineList0_simd16
, PaLineList0
, PaLineListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
1858 void PaLineListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1860 #if USE_SIMD16_FRONTEND
1861 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
1862 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, 1, slot
);
1864 if (pa
.useAlternateOffset
)
1866 primIndex
+= KNOB_SIMD_WIDTH
;
1872 verts
[0] = swizzleLane0(a
);
1873 verts
[1] = swizzleLane1(a
);
1876 verts
[0] = swizzleLane2(a
);
1877 verts
[1] = swizzleLane3(a
);
1880 verts
[0] = swizzleLane4(a
);
1881 verts
[1] = swizzleLane5(a
);
1884 verts
[0] = swizzleLane6(a
);
1885 verts
[1] = swizzleLane7(a
);
1888 verts
[0] = swizzleLane8(a
);
1889 verts
[1] = swizzleLane9(a
);
1892 verts
[0] = swizzleLaneA(a
);
1893 verts
[1] = swizzleLaneB(a
);
1896 verts
[0] = swizzleLaneC(a
);
1897 verts
[1] = swizzleLaneD(a
);
1900 verts
[0] = swizzleLaneE(a
);
1901 verts
[1] = swizzleLaneF(a
);
1904 verts
[0] = swizzleLane0(b
);
1905 verts
[1] = swizzleLane1(b
);
1908 verts
[0] = swizzleLane2(b
);
1909 verts
[1] = swizzleLane3(b
);
1912 verts
[0] = swizzleLane4(b
);
1913 verts
[1] = swizzleLane5(b
);
1916 verts
[0] = swizzleLane6(b
);
1917 verts
[1] = swizzleLane7(b
);
1920 verts
[0] = swizzleLane8(b
);
1921 verts
[1] = swizzleLane9(b
);
1924 verts
[0] = swizzleLaneA(b
);
1925 verts
[1] = swizzleLaneB(b
);
1928 verts
[0] = swizzleLaneC(b
);
1929 verts
[1] = swizzleLaneD(b
);
1932 verts
[0] = swizzleLaneE(b
);
1933 verts
[1] = swizzleLaneF(b
);
1937 const simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
1938 const simdvector
&b
= PaGetSimdVector(pa
, 1, slot
);
1943 verts
[0] = swizzleLane0(a
);
1944 verts
[1] = swizzleLane1(a
);
1947 verts
[0] = swizzleLane2(a
);
1948 verts
[1] = swizzleLane3(a
);
1951 verts
[0] = swizzleLane4(a
);
1952 verts
[1] = swizzleLane5(a
);
1955 verts
[0] = swizzleLane6(a
);
1956 verts
[1] = swizzleLane7(a
);
1959 verts
[0] = swizzleLane0(b
);
1960 verts
[1] = swizzleLane1(b
);
1963 verts
[0] = swizzleLane2(b
);
1964 verts
[1] = swizzleLane3(b
);
1967 verts
[0] = swizzleLane4(b
);
1968 verts
[1] = swizzleLane5(b
);
1971 verts
[0] = swizzleLane6(b
);
1972 verts
[1] = swizzleLane7(b
);
1978 bool PaLineStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1980 SetNextPaState(pa
, PaLineStrip1
, PaLineStripSingle0
);
1981 return false; // Not enough vertices to assemble 8 lines
1984 bool PaLineStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
1986 #if USE_SIMD16_FRONTEND
1990 if (!pa
.useAlternateOffset
)
1992 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
1994 for (uint32_t i
= 0; i
< 4; i
+= 1)
1996 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
1997 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
2002 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
2004 for (uint32_t i
= 0; i
< 4; i
+= 1)
2006 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
2007 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);
2012 simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
2013 simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
2016 /// @todo: verify provoking vertex is correct
2017 // Line list 0 1 2 3 4 5 6 7
2018 // 8 9 10 11 12 13 14 15
2026 for(uint32_t i
= 0; i
< 4; ++i
)
2029 __m256 vPermA
= _mm256_permute_ps(a
.v
[i
], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
2030 // 4 5 6 7 8 9 10 11
2031 __m256 vAHighBLow
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x21);
2034 __m256 vPermB
= _mm256_permute_ps(vAHighBLow
, 0); // indices hi->low (0 0 0 0)
2036 verts
[1].v
[i
] = _mm256_blend_ps(vPermA
, vPermB
, 0x88);
2039 SetNextPaState(pa
, PaLineStrip1
, PaLineStripSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
2043 #if ENABLE_AVX512_SIMD16
2044 bool PaLineStrip0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
2046 SetNextPaState_simd16(pa
, PaLineStrip1_simd16
, PaLineStrip1
, PaLineStripSingle0
);
2047 return false; // Not enough vertices to assemble 16 lines
2050 bool PaLineStrip1_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
2052 const simd16scalari perm
= _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
2054 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
2055 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
2057 simd16vector
&v0
= verts
[0];
2058 simd16vector
&v1
= verts
[1];
2060 // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2061 // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2063 v0
= a
; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2065 // for simd16 x, y, z, and w
2066 for (int i
= 0; i
< 4; i
+= 1)
2068 simd16scalar temp
= _simd16_blend_ps(a
[i
], b
[i
], 0x0001); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2070 v1
[i
] = _simd16_permute_ps(temp
, perm
); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2073 SetNextPaState_simd16(pa
, PaLineStrip1_simd16
, PaLineStrip1
, PaLineStripSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
);
2078 void PaLineStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
2080 #if USE_SIMD16_FRONTEND
2081 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.prev
, slot
);
2082 const simd16vector
&b
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
2084 if (pa
.useAlternateOffset
)
2086 primIndex
+= KNOB_SIMD_WIDTH
;
2092 verts
[0] = swizzleLane0(a
);
2093 verts
[1] = swizzleLane1(a
);
2096 verts
[0] = swizzleLane1(a
);
2097 verts
[1] = swizzleLane2(a
);
2100 verts
[0] = swizzleLane2(a
);
2101 verts
[1] = swizzleLane3(a
);
2104 verts
[0] = swizzleLane3(a
);
2105 verts
[1] = swizzleLane4(a
);
2108 verts
[0] = swizzleLane4(a
);
2109 verts
[1] = swizzleLane5(a
);
2112 verts
[0] = swizzleLane5(a
);
2113 verts
[1] = swizzleLane6(a
);
2116 verts
[0] = swizzleLane6(a
);
2117 verts
[1] = swizzleLane7(a
);
2120 verts
[0] = swizzleLane7(a
);
2121 verts
[1] = swizzleLane8(a
);
2124 verts
[0] = swizzleLane8(a
);
2125 verts
[1] = swizzleLane9(a
);
2128 verts
[0] = swizzleLane9(a
);
2129 verts
[1] = swizzleLaneA(a
);
2132 verts
[0] = swizzleLaneA(a
);
2133 verts
[1] = swizzleLaneB(a
);
2136 verts
[0] = swizzleLaneB(a
);
2137 verts
[1] = swizzleLaneC(a
);
2140 verts
[0] = swizzleLaneC(a
);
2141 verts
[1] = swizzleLaneD(a
);
2144 verts
[0] = swizzleLaneD(a
);
2145 verts
[1] = swizzleLaneE(a
);
2148 verts
[0] = swizzleLaneE(a
);
2149 verts
[1] = swizzleLaneF(a
);
2152 verts
[0] = swizzleLaneF(a
);
2153 verts
[1] = swizzleLane0(b
);
2157 const simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
2158 const simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
2163 verts
[0] = swizzleLane0(a
);
2164 verts
[1] = swizzleLane1(a
);
2167 verts
[0] = swizzleLane1(a
);
2168 verts
[1] = swizzleLane2(a
);
2171 verts
[0] = swizzleLane2(a
);
2172 verts
[1] = swizzleLane3(a
);
2175 verts
[0] = swizzleLane3(a
);
2176 verts
[1] = swizzleLane4(a
);
2179 verts
[0] = swizzleLane4(a
);
2180 verts
[1] = swizzleLane5(a
);
2183 verts
[0] = swizzleLane5(a
);
2184 verts
[1] = swizzleLane6(a
);
2187 verts
[0] = swizzleLane6(a
);
2188 verts
[1] = swizzleLane7(a
);
2191 verts
[0] = swizzleLane7(a
);
2192 verts
[1] = swizzleLane0(b
);
2198 bool PaPoints0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
2200 #if USE_SIMD16_FRONTEND
2203 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
2205 if (!pa
.useAlternateOffset
)
2207 for (uint32_t i
= 0; i
< 4; i
+= 1)
2209 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
2214 for (uint32_t i
= 0; i
< 4; i
+= 1)
2216 a
[i
] = _simd16_extract_ps(a_16
[i
], 1);
2221 simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
2224 verts
[0] = a
; // points only have 1 vertex.
2226 SetNextPaState(pa
, PaPoints0
, PaPointsSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
2230 #if ENABLE_AVX512_SIMD16
2231 bool PaPoints0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
2233 simd16vector
&a
= PaGetSimdVector_simd16(pa
, pa
.cur
, slot
);
2235 verts
[0] = a
; // points only have 1 vertex.
2237 SetNextPaState_simd16(pa
, PaPoints0_simd16
, PaPoints0
, PaPointsSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
2242 void PaPointsSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
2244 #if USE_SIMD16_FRONTEND
2245 const simd16vector
&a
= PaGetSimdVector_simd16(pa
, 0, slot
);
2247 if (pa
.useAlternateOffset
)
2249 primIndex
+= KNOB_SIMD_WIDTH
;
2252 verts
[0] = swizzleLaneN(a
, primIndex
);
2254 const simdvector
&a
= PaGetSimdVector(pa
, 0, slot
);
2256 verts
[0] = swizzleLaneN(a
, primIndex
);
2260 //////////////////////////////////////////////////////////////////////////
2261 /// @brief State 1 for RECT_LIST topology.
2262 /// There is not enough to assemble 8 triangles.
2263 bool PaRectList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
2265 SetNextPaState(pa
, PaRectList1
, PaRectListSingle0
);
2269 //////////////////////////////////////////////////////////////////////////
2270 /// @brief State 1 for RECT_LIST topology.
2271 /// Rect lists has the following format.
2273 /// v2 o---o v5 o---o v8 o---o v11 o---o
2274 /// | \ | | \ | | \ | | \ |
2275 /// v1 o---o v4 o---o v7 o---o v10 o---o
2278 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2280 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2281 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2284 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2285 /// where v0 contains all the first vertices for 8 triangles.
2288 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2289 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2290 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
2292 /// @param pa - State for PA state machine.
2293 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2294 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2300 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
2301 #if USE_SIMD16_FRONTEND
2305 if (!pa
.useAlternateOffset
)
2307 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
2309 for (uint32_t i
= 0; i
< 4; i
+= 1)
2311 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
2312 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
2317 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
2319 for (uint32_t i
= 0; i
< 4; i
+= 1)
2321 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
2322 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);;
2327 simdvector
&a
= PaGetSimdVector(pa
, 0, slot
); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
2328 simdvector
&b
= PaGetSimdVector(pa
, 1, slot
); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
2331 __m256 tmp0
, tmp1
, tmp2
;
2333 // Loop over each component in the simdvector.
2334 for(int i
= 0; i
< 4; ++i
)
2336 simdvector
& v0
= verts
[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2337 tmp0
= _mm256_permute2f128_ps(b
[i
], b
[i
], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2338 v0
[i
] = _mm256_blend_ps(a
[i
], tmp0
, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
2339 tmp1
= _mm256_permute_ps(v0
[i
], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
2340 v0
[i
] = _mm256_permute_ps(v0
[i
], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
2341 v0
[i
] = _mm256_blend_ps(tmp1
, v0
[i
], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
2343 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2344 /// AVX2 should make this much cheaper.
2345 simdvector
& v1
= verts
[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2346 v1
[i
] = _mm256_permute_ps(a
[i
], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
2347 tmp1
= _mm256_permute_ps(a
[i
], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
2348 tmp2
= _mm256_blend_ps(v1
[i
], tmp1
, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
2349 tmp1
= _mm256_permute2f128_ps(tmp2
, tmp2
, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
2350 v1
[i
] = _mm256_permute_ps(tmp0
, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
2351 v1
[i
] = _mm256_blend_ps(tmp2
, v1
[i
], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
2352 v1
[i
] = _mm256_blend_ps(v1
[i
], tmp1
, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2354 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
2355 simdvector
& v2
= verts
[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2356 v2
[i
] = _mm256_permute_ps(tmp0
, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
2357 tmp1
= _mm256_permute_ps(tmp2
, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2358 v2
[i
] = _mm256_blend_ps(tmp1
, v2
[i
], 0xF0);
2360 // Need to compute 4th implied vertex for the rectangle.
2361 tmp2
= _mm256_sub_ps(v0
[i
], v1
[i
]);
2362 tmp2
= _mm256_add_ps(tmp2
, v2
[i
]); // tmp2 = { w, *, x, *, y, *, z, * }
2363 tmp2
= _mm256_permute_ps(tmp2
, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
2364 v2
[i
] = _mm256_blend_ps(v2
[i
], tmp2
, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
2367 SetNextPaState(pa
, PaRectList1
, PaRectListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
2371 //////////////////////////////////////////////////////////////////////////
2372 /// @brief State 2 for RECT_LIST topology.
2373 /// Not implemented unless there is a use case for more then 8 rects.
2374 /// @param pa - State for PA state machine.
2375 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2376 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2382 SWR_INVALID("Is rect list used for anything other then clears?");
2383 SetNextPaState(pa
, PaRectList0
, PaRectListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
2387 #if ENABLE_AVX512_SIMD16
2388 //////////////////////////////////////////////////////////////////////////
2389 /// @brief State 1 for RECT_LIST topology.
2390 /// There is not enough to assemble 8 triangles.
2391 bool PaRectList0_simd16(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[])
2393 SetNextPaState_simd16(pa
, PaRectList1_simd16
, PaRectList1
, PaRectListSingle0
);
2397 //////////////////////////////////////////////////////////////////////////
2398 /// @brief State 1 for RECT_LIST topology.
2399 /// Rect lists has the following format.
2401 /// v2 o---o v5 o---o v8 o---o v11 o---o
2402 /// | \ | | \ | | \ | | \ |
2403 /// v1 o---o v4 o---o v7 o---o v10 o---o
2406 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2408 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2409 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2412 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2413 /// where v0 contains all the first vertices for 8 triangles.
2416 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2417 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2418 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
2420 /// @param pa - State for PA state machine.
2421 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2422 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2423 bool PaRectList1_simd16(
2426 simd16vector verts
[])
2431 if (!pa
.useAlternateOffset
)
2433 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }
2435 for (uint32_t i
= 0; i
< 4; i
+= 1)
2437 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
2438 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
2443 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
); // b[] = { v16...but not used by this implementation.. }
2445 for (uint32_t i
= 0; i
< 4; i
+= 1)
2447 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
2448 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);
2452 simd16vector
&v0
= verts
[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2453 simd16vector
&v1
= verts
[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2454 simd16vector
&v2
= verts
[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2456 // Loop over each component in the simdvector.
2457 for (int i
= 0; i
< 4; i
+= 1)
2459 simdscalar v0_lo
; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2460 simdscalar v1_lo
; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2461 simdscalar v2_lo
; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
2463 __m256 tmp0
, tmp1
, tmp2
;
2465 tmp0
= _mm256_permute2f128_ps(b
[i
], b
[i
], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2466 v0_lo
= _mm256_blend_ps(a
[i
], tmp0
, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
2467 tmp1
= _mm256_permute_ps(v0_lo
, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
2468 v0_lo
= _mm256_permute_ps(v0_lo
, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
2469 v0_lo
= _mm256_blend_ps(tmp1
, v0_lo
, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
2471 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2472 /// AVX2 should make this much cheaper.
2473 v1_lo
= _mm256_permute_ps(a
[i
], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
2474 tmp1
= _mm256_permute_ps(a
[i
], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
2475 tmp2
= _mm256_blend_ps(v1_lo
, tmp1
, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
2476 tmp1
= _mm256_permute2f128_ps(tmp2
, tmp2
, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
2477 v1_lo
= _mm256_permute_ps(tmp0
, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
2478 v1_lo
= _mm256_blend_ps(tmp2
, v1_lo
, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
2479 v1_lo
= _mm256_blend_ps(v1_lo
, tmp1
, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2481 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
2482 v2_lo
= _mm256_permute_ps(tmp0
, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
2483 tmp1
= _mm256_permute_ps(tmp2
, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2484 v2_lo
= _mm256_blend_ps(tmp1
, v2_lo
, 0xF0);
2486 // Need to compute 4th implied vertex for the rectangle.
2487 tmp2
= _mm256_sub_ps(v0_lo
, v1_lo
);
2488 tmp2
= _mm256_add_ps(tmp2
, v2_lo
); // tmp2 = { w, *, x, *, y, *, z, * }
2489 tmp2
= _mm256_permute_ps(tmp2
, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
2490 v2_lo
= _mm256_blend_ps(v2_lo
, tmp2
, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
2492 v0
[i
] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo
, 0);
2493 v1
[i
] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo
, 0);
2494 v2
[i
] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo
, 0);
2497 SetNextPaState_simd16(pa
, PaRectList1_simd16
, PaRectList1
, PaRectListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
2501 //////////////////////////////////////////////////////////////////////////
2502 /// @brief State 2 for RECT_LIST topology.
2503 /// Not implemented unless there is a use case for more then 8 rects.
2504 /// @param pa - State for PA state machine.
2505 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2506 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2507 bool PaRectList2_simd16(
2510 simd16vector verts
[])
2512 SWR_INVALID("Is rect list used for anything other then clears?");
2513 SetNextPaState_simd16(pa
, PaRectList0_simd16
, PaRectList0
, PaRectListSingle0
, 0, PA_STATE_OPT::SIMD_WIDTH
, true);
2518 //////////////////////////////////////////////////////////////////////////
2519 /// @brief This procedure is called by the Binner to assemble the attributes.
2520 /// Unlike position, which is stored vertically, the attributes are
2521 /// stored horizontally. The outputs from the VS, labeled as 'a' and
2522 /// 'b' are vertical. This function needs to transpose the lanes
2523 /// containing the vertical attribute data into horizontal form.
2524 /// @param pa - State for PA state machine.
2525 /// @param slot - Index into VS output for a given attribute.
2526 /// @param primIndex - Binner processes each triangle individually.
2527 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
2528 void PaRectListSingle0(
2534 // We have 12 simdscalars contained within 3 simdvectors which
2535 // hold at least 8 triangles worth of data. We want to assemble a single
2536 // triangle with data in horizontal form.
2537 #if USE_SIMD16_FRONTEND
2541 if (!pa
.useAlternateOffset
)
2543 const simd16vector
&a_16
= PaGetSimdVector_simd16(pa
, 0, slot
);
2545 for (uint32_t i
= 0; i
< 4; i
+= 1)
2547 a
[i
] = _simd16_extract_ps(a_16
[i
], 0);
2548 b
[i
] = _simd16_extract_ps(a_16
[i
], 1);
2553 const simd16vector
&b_16
= PaGetSimdVector_simd16(pa
, 1, slot
);
2555 for (uint32_t i
= 0; i
< 4; i
+= 1)
2557 a
[i
] = _simd16_extract_ps(b_16
[i
], 0);
2558 b
[i
] = _simd16_extract_ps(b_16
[i
], 1);;
2563 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
2566 // Convert from vertical to horizontal.
2570 verts
[0] = swizzleLane0(a
);
2571 verts
[1] = swizzleLane1(a
);
2572 verts
[2] = swizzleLane2(a
);
2575 verts
[0] = swizzleLane0(a
);
2576 verts
[1] = swizzleLane2(a
);
2577 verts
[2] = _mm_blend_ps(verts
[0], verts
[1], 0xA);
2585 SWR_INVALID("Invalid primIndex: %d", primIndex
);
2590 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT
*in_pDC
, uint32_t in_numPrims
, uint8_t* pStream
, uint32_t in_streamSizeInVerts
,
2591 uint32_t in_vertexStride
, bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
) :
2592 PA_STATE(in_pDC
, pStream
, in_streamSizeInVerts
, in_vertexStride
), numPrims(in_numPrims
), numPrimsComplete(0), numSimdPrims(0),
2593 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming
)
2595 const API_STATE
& state
= GetApiState(pDC
);
2597 this->binTopology
= topo
== TOP_UNKNOWN
? state
.topology
: topo
;
2599 #if ENABLE_AVX512_SIMD16
2600 pfnPaFunc_simd16
= nullptr;
2603 switch (this->binTopology
)
2605 case TOP_TRIANGLE_LIST
:
2606 this->pfnPaFunc
= PaTriList0
;
2607 #if ENABLE_AVX512_SIMD16
2608 this->pfnPaFunc_simd16
= PaTriList0_simd16
;
2611 case TOP_TRIANGLE_STRIP
:
2612 this->pfnPaFunc
= PaTriStrip0
;
2613 #if ENABLE_AVX512_SIMD16
2614 this->pfnPaFunc_simd16
= PaTriStrip0_simd16
;
2617 case TOP_TRIANGLE_FAN
:
2618 this->pfnPaFunc
= PaTriFan0
;
2619 #if ENABLE_AVX512_SIMD16
2620 this->pfnPaFunc_simd16
= PaTriFan0_simd16
;
2624 this->pfnPaFunc
= PaQuadList0
;
2625 #if ENABLE_AVX512_SIMD16
2626 this->pfnPaFunc_simd16
= PaQuadList0_simd16
;
2628 this->numPrims
= in_numPrims
* 2; // Convert quad primitives into triangles
2630 case TOP_QUAD_STRIP
:
2631 // quad strip pattern when decomposed into triangles is the same as verts strips
2632 this->pfnPaFunc
= PaTriStrip0
;
2633 #if ENABLE_AVX512_SIMD16
2634 this->pfnPaFunc_simd16
= PaTriStrip0_simd16
;
2636 this->numPrims
= in_numPrims
* 2; // Convert quad primitives into triangles
2639 this->pfnPaFunc
= PaLineList0
;
2640 #if ENABLE_AVX512_SIMD16
2641 this->pfnPaFunc_simd16
= PaLineList0_simd16
;
2643 this->numPrims
= in_numPrims
;
2645 case TOP_LINE_STRIP
:
2646 this->pfnPaFunc
= PaLineStrip0
;
2647 #if ENABLE_AVX512_SIMD16
2648 this->pfnPaFunc_simd16
= PaLineStrip0_simd16
;
2650 this->numPrims
= in_numPrims
;
2653 this->pfnPaFunc
= PaLineLoop0
;
2654 #if ENABLE_AVX512_SIMD16
2655 this->pfnPaFunc_simd16
= PaLineLoop0_simd16
;
2657 this->numPrims
= in_numPrims
;
2659 case TOP_POINT_LIST
:
2660 this->pfnPaFunc
= PaPoints0
;
2661 #if ENABLE_AVX512_SIMD16
2662 this->pfnPaFunc_simd16
= PaPoints0_simd16
;
2664 this->numPrims
= in_numPrims
;
2667 this->pfnPaFunc
= PaRectList0
;
2668 #if ENABLE_AVX512_SIMD16
2669 this->pfnPaFunc_simd16
= PaRectList0_simd16
;
2671 this->numPrims
= in_numPrims
* 2;
2674 case TOP_PATCHLIST_1
:
2675 this->pfnPaFunc
= PaPatchList
<1>;
2676 #if ENABLE_AVX512_SIMD16
2677 this->pfnPaFunc_simd16
= PaPatchList_simd16
<1>;
2680 case TOP_PATCHLIST_2
:
2681 this->pfnPaFunc
= PaPatchList
<2>;
2682 #if ENABLE_AVX512_SIMD16
2683 this->pfnPaFunc_simd16
= PaPatchList_simd16
<2>;
2686 case TOP_PATCHLIST_3
:
2687 this->pfnPaFunc
= PaPatchList
<3>;
2688 #if ENABLE_AVX512_SIMD16
2689 this->pfnPaFunc_simd16
= PaPatchList_simd16
<3>;
2692 case TOP_PATCHLIST_4
:
2693 this->pfnPaFunc
= PaPatchList
<4>;
2694 #if ENABLE_AVX512_SIMD16
2695 this->pfnPaFunc_simd16
= PaPatchList_simd16
<4>;
2698 case TOP_PATCHLIST_5
:
2699 this->pfnPaFunc
= PaPatchList
<5>;
2700 #if ENABLE_AVX512_SIMD16
2701 this->pfnPaFunc_simd16
= PaPatchList_simd16
<5>;
2704 case TOP_PATCHLIST_6
:
2705 this->pfnPaFunc
= PaPatchList
<6>;
2706 #if ENABLE_AVX512_SIMD16
2707 this->pfnPaFunc_simd16
= PaPatchList_simd16
<6>;
2710 case TOP_PATCHLIST_7
:
2711 this->pfnPaFunc
= PaPatchList
<7>;
2712 #if ENABLE_AVX512_SIMD16
2713 this->pfnPaFunc_simd16
= PaPatchList_simd16
<7>;
2716 case TOP_PATCHLIST_8
:
2717 this->pfnPaFunc
= PaPatchList
<8>;
2718 #if ENABLE_AVX512_SIMD16
2719 this->pfnPaFunc_simd16
= PaPatchList_simd16
<8>;
2722 case TOP_PATCHLIST_9
:
2723 this->pfnPaFunc
= PaPatchList
<9>;
2724 #if ENABLE_AVX512_SIMD16
2725 this->pfnPaFunc_simd16
= PaPatchList_simd16
<9>;
2728 case TOP_PATCHLIST_10
:
2729 this->pfnPaFunc
= PaPatchList
<10>;
2730 #if ENABLE_AVX512_SIMD16
2731 this->pfnPaFunc_simd16
= PaPatchList_simd16
<10>;
2734 case TOP_PATCHLIST_11
:
2735 this->pfnPaFunc
= PaPatchList
<11>;
2736 #if ENABLE_AVX512_SIMD16
2737 this->pfnPaFunc_simd16
= PaPatchList_simd16
<11>;
2740 case TOP_PATCHLIST_12
:
2741 this->pfnPaFunc
= PaPatchList
<12>;
2742 #if ENABLE_AVX512_SIMD16
2743 this->pfnPaFunc_simd16
= PaPatchList_simd16
<12>;
2746 case TOP_PATCHLIST_13
:
2747 this->pfnPaFunc
= PaPatchList
<13>;
2748 #if ENABLE_AVX512_SIMD16
2749 this->pfnPaFunc_simd16
= PaPatchList_simd16
<13>;
2752 case TOP_PATCHLIST_14
:
2753 this->pfnPaFunc
= PaPatchList
<14>;
2754 #if ENABLE_AVX512_SIMD16
2755 this->pfnPaFunc_simd16
= PaPatchList_simd16
<14>;
2758 case TOP_PATCHLIST_15
:
2759 this->pfnPaFunc
= PaPatchList
<15>;
2760 #if ENABLE_AVX512_SIMD16
2761 this->pfnPaFunc_simd16
= PaPatchList_simd16
<15>;
2764 case TOP_PATCHLIST_16
:
2765 this->pfnPaFunc
= PaPatchList
<16>;
2766 #if ENABLE_AVX512_SIMD16
2767 this->pfnPaFunc_simd16
= PaPatchList_simd16
<16>;
2770 case TOP_PATCHLIST_17
:
2771 this->pfnPaFunc
= PaPatchList
<17>;
2772 #if ENABLE_AVX512_SIMD16
2773 this->pfnPaFunc_simd16
= PaPatchList_simd16
<17>;
2776 case TOP_PATCHLIST_18
:
2777 this->pfnPaFunc
= PaPatchList
<18>;
2778 #if ENABLE_AVX512_SIMD16
2779 this->pfnPaFunc_simd16
= PaPatchList_simd16
<18>;
2782 case TOP_PATCHLIST_19
:
2783 this->pfnPaFunc
= PaPatchList
<19>;
2784 #if ENABLE_AVX512_SIMD16
2785 this->pfnPaFunc_simd16
= PaPatchList_simd16
<19>;
2788 case TOP_PATCHLIST_20
:
2789 this->pfnPaFunc
= PaPatchList
<20>;
2790 #if ENABLE_AVX512_SIMD16
2791 this->pfnPaFunc_simd16
= PaPatchList_simd16
<20>;
2794 case TOP_PATCHLIST_21
:
2795 this->pfnPaFunc
= PaPatchList
<21>;
2796 #if ENABLE_AVX512_SIMD16
2797 this->pfnPaFunc_simd16
= PaPatchList_simd16
<21>;
2800 case TOP_PATCHLIST_22
:
2801 this->pfnPaFunc
= PaPatchList
<22>;
2802 #if ENABLE_AVX512_SIMD16
2803 this->pfnPaFunc_simd16
= PaPatchList_simd16
<22>;
2806 case TOP_PATCHLIST_23
:
2807 this->pfnPaFunc
= PaPatchList
<23>;
2808 #if ENABLE_AVX512_SIMD16
2809 this->pfnPaFunc_simd16
= PaPatchList_simd16
<23>;
2812 case TOP_PATCHLIST_24
:
2813 this->pfnPaFunc
= PaPatchList
<24>;
2814 #if ENABLE_AVX512_SIMD16
2815 this->pfnPaFunc_simd16
= PaPatchList_simd16
<24>;
2818 case TOP_PATCHLIST_25
:
2819 this->pfnPaFunc
= PaPatchList
<25>;
2820 #if ENABLE_AVX512_SIMD16
2821 this->pfnPaFunc_simd16
= PaPatchList_simd16
<25>;
2824 case TOP_PATCHLIST_26
:
2825 this->pfnPaFunc
= PaPatchList
<26>;
2826 #if ENABLE_AVX512_SIMD16
2827 this->pfnPaFunc_simd16
= PaPatchList_simd16
<26>;
2830 case TOP_PATCHLIST_27
:
2831 this->pfnPaFunc
= PaPatchList
<27>;
2832 #if ENABLE_AVX512_SIMD16
2833 this->pfnPaFunc_simd16
= PaPatchList_simd16
<27>;
2836 case TOP_PATCHLIST_28
:
2837 this->pfnPaFunc
= PaPatchList
<28>;
2838 #if ENABLE_AVX512_SIMD16
2839 this->pfnPaFunc_simd16
= PaPatchList_simd16
<28>;
2842 case TOP_PATCHLIST_29
:
2843 this->pfnPaFunc
= PaPatchList
<29>;
2844 #if ENABLE_AVX512_SIMD16
2845 this->pfnPaFunc_simd16
= PaPatchList_simd16
<29>;
2848 case TOP_PATCHLIST_30
:
2849 this->pfnPaFunc
= PaPatchList
<30>;
2850 #if ENABLE_AVX512_SIMD16
2851 this->pfnPaFunc_simd16
= PaPatchList_simd16
<30>;
2854 case TOP_PATCHLIST_31
:
2855 this->pfnPaFunc
= PaPatchList
<31>;
2856 #if ENABLE_AVX512_SIMD16
2857 this->pfnPaFunc_simd16
= PaPatchList_simd16
<31>;
2860 case TOP_PATCHLIST_32
:
2861 this->pfnPaFunc
= PaPatchList
<32>;
2862 #if ENABLE_AVX512_SIMD16
2863 this->pfnPaFunc_simd16
= PaPatchList_simd16
<32>;
2868 SWR_INVALID("Invalid topology: %d", this->binTopology
);
2872 this->pfnPaFuncReset
= this->pfnPaFunc
;
2873 #if ENABLE_AVX512_SIMD16
2874 this->pfnPaFuncReset_simd16
= this->pfnPaFunc_simd16
;
2877 #if USE_SIMD16_FRONTEND
2878 simd16scalari id16
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
2879 simd16scalari id82
= _simd16_set_epi32( 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
2882 simdscalari id8
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2883 simdscalari id4
= _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
2886 switch(this->binTopology
)
2888 case TOP_TRIANGLE_LIST
:
2889 case TOP_TRIANGLE_STRIP
:
2890 case TOP_TRIANGLE_FAN
:
2891 case TOP_LINE_STRIP
:
2894 #if USE_SIMD16_FRONTEND
2895 this->primIDIncr
= 16;
2896 this->primID
= id16
;
2898 this->primIDIncr
= 8;
2903 case TOP_QUAD_STRIP
:
2905 #if USE_SIMD16_FRONTEND
2906 this->primIDIncr
= 8;
2907 this->primID
= id82
;
2909 this->primIDIncr
= 4;
2913 case TOP_POINT_LIST
:
2914 #if USE_SIMD16_FRONTEND
2915 this->primIDIncr
= 16;
2916 this->primID
= id16
;
2918 this->primIDIncr
= 8;
2922 case TOP_PATCHLIST_1
:
2923 case TOP_PATCHLIST_2
:
2924 case TOP_PATCHLIST_3
:
2925 case TOP_PATCHLIST_4
:
2926 case TOP_PATCHLIST_5
:
2927 case TOP_PATCHLIST_6
:
2928 case TOP_PATCHLIST_7
:
2929 case TOP_PATCHLIST_8
:
2930 case TOP_PATCHLIST_9
:
2931 case TOP_PATCHLIST_10
:
2932 case TOP_PATCHLIST_11
:
2933 case TOP_PATCHLIST_12
:
2934 case TOP_PATCHLIST_13
:
2935 case TOP_PATCHLIST_14
:
2936 case TOP_PATCHLIST_15
:
2937 case TOP_PATCHLIST_16
:
2938 case TOP_PATCHLIST_17
:
2939 case TOP_PATCHLIST_18
:
2940 case TOP_PATCHLIST_19
:
2941 case TOP_PATCHLIST_20
:
2942 case TOP_PATCHLIST_21
:
2943 case TOP_PATCHLIST_22
:
2944 case TOP_PATCHLIST_23
:
2945 case TOP_PATCHLIST_24
:
2946 case TOP_PATCHLIST_25
:
2947 case TOP_PATCHLIST_26
:
2948 case TOP_PATCHLIST_27
:
2949 case TOP_PATCHLIST_28
:
2950 case TOP_PATCHLIST_29
:
2951 case TOP_PATCHLIST_30
:
2952 case TOP_PATCHLIST_31
:
2953 case TOP_PATCHLIST_32
:
2954 // Always run KNOB_SIMD_WIDTH number of patches at a time.
2955 #if USE_SIMD16_FRONTEND
2956 this->primIDIncr
= 16;
2957 this->primID
= id16
;
2959 this->primIDIncr
= 8;
2965 SWR_INVALID("Invalid topology: %d", this->binTopology
);