1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
35 #if (KNOB_SIMD_WIDTH == 8)
37 bool PaTriList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
38 bool PaTriList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
39 bool PaTriList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
40 void PaTriListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
42 bool PaTriStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
43 bool PaTriStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
44 void PaTriStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
46 bool PaTriFan0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
47 bool PaTriFan1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
48 void PaTriFanSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
50 bool PaQuadList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
51 bool PaQuadList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
52 void PaQuadListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
54 bool PaLineLoop0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
55 bool PaLineLoop1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
57 bool PaLineList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
58 bool PaLineList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
59 void PaLineListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t index
, __m128 verts
[]);
61 bool PaLineStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
62 bool PaLineStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
63 void PaLineStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 lineverts
[]);
65 bool PaPoints0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
66 void PaPointsSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
68 bool PaRectList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
69 bool PaRectList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
70 bool PaRectList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
71 void PaRectListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
73 template <uint32_t TotalControlPoints
>
74 void PaPatchListSingle(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
76 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
77 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
78 // Each attribute has 4 components.
80 /// @todo Optimize this
82 float* pOutVec
= (float*)verts
;
84 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
86 uint32_t input_cp
= primIndex
* TotalControlPoints
+ cp
;
87 uint32_t input_vec
= input_cp
/ KNOB_SIMD_WIDTH
;
88 uint32_t input_lane
= input_cp
% KNOB_SIMD_WIDTH
;
90 // Loop over all components of the attribute
91 for (uint32_t i
= 0; i
< 4; ++i
)
93 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
94 pOutVec
[cp
* 4 + i
] = pInputVec
[input_lane
];
99 template<uint32_t TotalControlPoints
, uint32_t CurrentControlPoints
= 1>
100 static bool PaPatchList(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
104 PaPatchList
<TotalControlPoints
, CurrentControlPoints
+ 1>,
105 PaPatchListSingle
<TotalControlPoints
>);
110 template<uint32_t TotalControlPoints
>
111 static bool PaPatchListTerm(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
113 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
114 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
115 // Each attribute has 4 components.
117 /// @todo Optimize this
119 // Loop over all components of the attribute
120 for (uint32_t i
= 0; i
< 4; ++i
)
122 for (uint32_t cp
= 0; cp
< TotalControlPoints
; ++cp
)
124 float vec
[KNOB_SIMD_WIDTH
];
125 for (uint32_t lane
= 0; lane
< KNOB_SIMD_WIDTH
; ++lane
)
127 uint32_t input_cp
= lane
* TotalControlPoints
+ cp
;
128 uint32_t input_vec
= input_cp
/ KNOB_SIMD_WIDTH
;
129 uint32_t input_lane
= input_cp
% KNOB_SIMD_WIDTH
;
131 const float* pInputVec
= (const float*)(&PaGetSimdVector(pa
, input_vec
, slot
)[i
]);
132 vec
[lane
] = pInputVec
[input_lane
];
134 verts
[cp
][i
] = _simd_loadu_ps(vec
);
140 PaPatchList
<TotalControlPoints
>,
141 PaPatchListSingle
<TotalControlPoints
>,
149 #define PA_PATCH_LIST_TERMINATOR(N) \
150 template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
151 { return PaPatchListTerm<N>(pa, slot, verts); }
152 PA_PATCH_LIST_TERMINATOR(1)
153 PA_PATCH_LIST_TERMINATOR(2)
154 PA_PATCH_LIST_TERMINATOR(3)
155 PA_PATCH_LIST_TERMINATOR(4)
156 PA_PATCH_LIST_TERMINATOR(5)
157 PA_PATCH_LIST_TERMINATOR(6)
158 PA_PATCH_LIST_TERMINATOR(7)
159 PA_PATCH_LIST_TERMINATOR(8)
160 PA_PATCH_LIST_TERMINATOR(9)
161 PA_PATCH_LIST_TERMINATOR(10)
162 PA_PATCH_LIST_TERMINATOR(11)
163 PA_PATCH_LIST_TERMINATOR(12)
164 PA_PATCH_LIST_TERMINATOR(13)
165 PA_PATCH_LIST_TERMINATOR(14)
166 PA_PATCH_LIST_TERMINATOR(15)
167 PA_PATCH_LIST_TERMINATOR(16)
168 PA_PATCH_LIST_TERMINATOR(17)
169 PA_PATCH_LIST_TERMINATOR(18)
170 PA_PATCH_LIST_TERMINATOR(19)
171 PA_PATCH_LIST_TERMINATOR(20)
172 PA_PATCH_LIST_TERMINATOR(21)
173 PA_PATCH_LIST_TERMINATOR(22)
174 PA_PATCH_LIST_TERMINATOR(23)
175 PA_PATCH_LIST_TERMINATOR(24)
176 PA_PATCH_LIST_TERMINATOR(25)
177 PA_PATCH_LIST_TERMINATOR(26)
178 PA_PATCH_LIST_TERMINATOR(27)
179 PA_PATCH_LIST_TERMINATOR(28)
180 PA_PATCH_LIST_TERMINATOR(29)
181 PA_PATCH_LIST_TERMINATOR(30)
182 PA_PATCH_LIST_TERMINATOR(31)
183 PA_PATCH_LIST_TERMINATOR(32)
184 #undef PA_PATCH_LIST_TERMINATOR
186 bool PaTriList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
188 SetNextPaState(pa
, PaTriList1
, PaTriListSingle0
);
189 return false; // Not enough vertices to assemble 4 or 8 triangles.
192 bool PaTriList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
194 SetNextPaState(pa
, PaTriList2
, PaTriListSingle0
);
195 return false; // Not enough vertices to assemble 8 triangles.
198 bool PaTriList2(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
200 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
201 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
202 simdvector
& c
= PaGetSimdVector(pa
, 2, slot
);
205 // Tri Pattern - provoking vertex is always v0
206 // v0 -> 0 3 6 9 12 15 18 21
207 // v1 -> 1 4 7 10 13 16 19 22
208 // v2 -> 2 5 8 11 14 17 20 23
210 for(int i
= 0; i
< 4; ++i
)
212 simdvector
& v0
= verts
[0];
213 v0
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x92);
214 v0
[i
] = _simd_blend_ps(v0
[i
], c
[i
], 0x24);
215 v0
[i
] = _mm256_permute_ps(v0
[i
], 0x6C);
216 s
= _mm256_permute2f128_ps(v0
[i
], v0
[i
], 0x21);
217 v0
[i
] = _simd_blend_ps(v0
[i
], s
, 0x44);
219 simdvector
& v1
= verts
[1];
220 v1
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x24);
221 v1
[i
] = _simd_blend_ps(v1
[i
], c
[i
], 0x49);
222 v1
[i
] = _mm256_permute_ps(v1
[i
], 0xB1);
223 s
= _mm256_permute2f128_ps(v1
[i
], v1
[i
], 0x21);
224 v1
[i
] = _simd_blend_ps(v1
[i
], s
, 0x66);
226 simdvector
& v2
= verts
[2];
227 v2
[i
] = _simd_blend_ps(a
[i
], b
[i
], 0x49);
228 v2
[i
] = _simd_blend_ps(v2
[i
], c
[i
], 0x92);
229 v2
[i
] = _mm256_permute_ps(v2
[i
], 0xC6);
230 s
= _mm256_permute2f128_ps(v2
[i
], v2
[i
], 0x21);
231 v2
[i
] = _simd_blend_ps(v2
[i
], s
, 0x22);
234 SetNextPaState(pa
, PaTriList0
, PaTriListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
238 void PaTriListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
240 // We have 12 simdscalars contained within 3 simdvectors which
241 // hold at least 8 triangles worth of data. We want to assemble a single
242 // triangle with data in horizontal form.
243 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
244 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
245 simdvector
& c
= PaGetSimdVector(pa
, 2, slot
);
247 // Convert from vertical to horizontal.
248 // Tri Pattern - provoking vertex is always v0
249 // v0 -> 0 3 6 9 12 15 18 21
250 // v1 -> 1 4 7 10 13 16 19 22
251 // v2 -> 2 5 8 11 14 17 20 23
255 verts
[0] = swizzleLane0(a
);
256 verts
[1] = swizzleLane1(a
);
257 verts
[2] = swizzleLane2(a
);
260 verts
[0] = swizzleLane3(a
);
261 verts
[1] = swizzleLane4(a
);
262 verts
[2] = swizzleLane5(a
);
265 verts
[0] = swizzleLane6(a
);
266 verts
[1] = swizzleLane7(a
);
267 verts
[2] = swizzleLane0(b
);
270 verts
[0] = swizzleLane1(b
);
271 verts
[1] = swizzleLane2(b
);
272 verts
[2] = swizzleLane3(b
);
275 verts
[0] = swizzleLane4(b
);
276 verts
[1] = swizzleLane5(b
);
277 verts
[2] = swizzleLane6(b
);
280 verts
[0] = swizzleLane7(b
);
281 verts
[1] = swizzleLane0(c
);
282 verts
[2] = swizzleLane1(c
);
285 verts
[0] = swizzleLane2(c
);
286 verts
[1] = swizzleLane3(c
);
287 verts
[2] = swizzleLane4(c
);
290 verts
[0] = swizzleLane5(c
);
291 verts
[1] = swizzleLane6(c
);
292 verts
[2] = swizzleLane7(c
);
297 bool PaTriStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
299 SetNextPaState(pa
, PaTriStrip1
, PaTriStripSingle0
);
300 return false; // Not enough vertices to assemble 8 triangles.
303 bool PaTriStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
305 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
306 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
309 for(int i
= 0; i
< 4; ++i
)
311 simdscalar a0
= a
[i
];
312 simdscalar b0
= b
[i
];
314 // Tri Pattern - provoking vertex is always v0
318 simdvector
& v0
= verts
[0];
322 s
= _mm256_permute2f128_ps(a0
, b0
, 0x21);
324 s
= _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(1, 0, 3, 2));
326 simdvector
& v1
= verts
[1];
328 v1
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(3, 1, 3, 1));
330 simdvector
& v2
= verts
[2];
332 v2
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(2, 2, 2, 2));
335 SetNextPaState(pa
, PaTriStrip1
, PaTriStripSingle0
, 0, KNOB_SIMD_WIDTH
);
339 void PaTriStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
341 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
342 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
344 // Convert from vertical to horizontal.
345 // Tri Pattern - provoking vertex is always v0
352 verts
[0] = swizzleLane0(a
);
353 verts
[1] = swizzleLane1(a
);
354 verts
[2] = swizzleLane2(a
);
357 verts
[0] = swizzleLane1(a
);
358 verts
[1] = swizzleLane3(a
);
359 verts
[2] = swizzleLane2(a
);
362 verts
[0] = swizzleLane2(a
);
363 verts
[1] = swizzleLane3(a
);
364 verts
[2] = swizzleLane4(a
);
367 verts
[0] = swizzleLane3(a
);
368 verts
[1] = swizzleLane5(a
);
369 verts
[2] = swizzleLane4(a
);
372 verts
[0] = swizzleLane4(a
);
373 verts
[1] = swizzleLane5(a
);
374 verts
[2] = swizzleLane6(a
);
377 verts
[0] = swizzleLane5(a
);
378 verts
[1] = swizzleLane7(a
);
379 verts
[2] = swizzleLane6(a
);
382 verts
[0] = swizzleLane6(a
);
383 verts
[1] = swizzleLane7(a
);
384 verts
[2] = swizzleLane0(b
);
387 verts
[0] = swizzleLane7(a
);
388 verts
[1] = swizzleLane1(b
);
389 verts
[2] = swizzleLane0(b
);
394 bool PaTriFan0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
396 simdvector
& a
= PaGetSimdVector(pa
, pa
.cur
, slot
);
398 // Extract vertex 0 to every lane of first vector
399 for(int i
= 0; i
< 4; ++i
)
402 simdvector
& v0
= verts
[0];
403 v0
[i
] = _simd_shuffle_ps(a0
, a0
, _MM_SHUFFLE(0, 0, 0, 0));
404 v0
[i
] = _mm256_permute2f128_ps(v0
[i
], a0
, 0x00);
407 // store off leading vertex for attributes
408 simdvertex
* pVertex
= (simdvertex
*)pa
.pStreamBase
;
409 pa
.leadingVertex
= pVertex
[pa
.cur
];
411 SetNextPaState(pa
, PaTriFan1
, PaTriFanSingle0
);
412 return false; // Not enough vertices to assemble 8 triangles.
415 bool PaTriFan1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
417 simdvector
& leadVert
= pa
.leadingVertex
.attrib
[slot
];
418 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
419 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
422 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
423 for(int i
= 0; i
< 4; ++i
)
425 simdscalar a0
= a
[i
];
426 simdscalar b0
= b
[i
];
428 __m256 comp
= leadVert
[i
];
429 simdvector
& v0
= verts
[0];
430 v0
[i
] = _simd_shuffle_ps(comp
, comp
, _MM_SHUFFLE(0, 0, 0, 0));
431 v0
[i
] = _mm256_permute2f128_ps(v0
[i
], comp
, 0x00);
433 simdvector
& v2
= verts
[2];
434 s
= _mm256_permute2f128_ps(a0
, b0
, 0x21);
435 v2
[i
] = _simd_shuffle_ps(a0
, s
, _MM_SHUFFLE(1, 0, 3, 2));
437 simdvector
& v1
= verts
[1];
438 v1
[i
] = _simd_shuffle_ps(a0
, v2
[i
], _MM_SHUFFLE(2, 1, 2, 1));
441 SetNextPaState(pa
, PaTriFan1
, PaTriFanSingle0
, 0, KNOB_SIMD_WIDTH
);
445 void PaTriFanSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
447 // vert 0 from leading vertex
448 simdvector
& lead
= pa
.leadingVertex
.attrib
[slot
];
449 verts
[0] = swizzleLane0(lead
);
451 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
452 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
457 verts
[1] = swizzleLaneN(a
, primIndex
+ 1);
461 verts
[1] = swizzleLane0(b
);
467 verts
[2] = swizzleLaneN(a
, primIndex
+ 2);
471 verts
[2] = swizzleLaneN(b
, primIndex
- 6);
475 bool PaQuadList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
477 SetNextPaState(pa
, PaQuadList1
, PaQuadListSingle0
);
478 return false; // Not enough vertices to assemble 8 triangles.
481 bool PaQuadList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
483 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
484 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
487 for(int i
= 0; i
< 4; ++i
)
489 simdscalar a0
= a
[i
];
490 simdscalar b0
= b
[i
];
492 s1
= _mm256_permute2f128_ps(a0
, b0
, 0x20);
493 s2
= _mm256_permute2f128_ps(a0
, b0
, 0x31);
495 simdvector
& v0
= verts
[0];
496 v0
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(0, 0, 0, 0));
498 simdvector
& v1
= verts
[1];
499 v1
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(2, 1, 2, 1));
501 simdvector
& v2
= verts
[2];
502 v2
[i
] = _simd_shuffle_ps(s1
, s2
, _MM_SHUFFLE(3, 2, 3, 2));
505 SetNextPaState(pa
, PaQuadList0
, PaQuadListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
509 void PaQuadListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
511 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
512 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
517 // triangle 0 - 0 1 2
518 verts
[0] = swizzleLane0(a
);
519 verts
[1] = swizzleLane1(a
);
520 verts
[2] = swizzleLane2(a
);
524 // triangle 1 - 0 2 3
525 verts
[0] = swizzleLane0(a
);
526 verts
[1] = swizzleLane2(a
);
527 verts
[2] = swizzleLane3(a
);
531 // triangle 2 - 4 5 6
532 verts
[0] = swizzleLane4(a
);
533 verts
[1] = swizzleLane5(a
);
534 verts
[2] = swizzleLane6(a
);
538 // triangle 3 - 4 6 7
539 verts
[0] = swizzleLane4(a
);
540 verts
[1] = swizzleLane6(a
);
541 verts
[2] = swizzleLane7(a
);
545 // triangle 4 - 8 9 10 (0 1 2)
546 verts
[0] = swizzleLane0(b
);
547 verts
[1] = swizzleLane1(b
);
548 verts
[2] = swizzleLane2(b
);
552 // triangle 5 - 8 10 11 (0 2 3)
553 verts
[0] = swizzleLane0(b
);
554 verts
[1] = swizzleLane2(b
);
555 verts
[2] = swizzleLane3(b
);
559 // triangle 6 - 12 13 14 (4 5 6)
560 verts
[0] = swizzleLane4(b
);
561 verts
[1] = swizzleLane5(b
);
562 verts
[2] = swizzleLane6(b
);
566 // triangle 7 - 12 14 15 (4 6 7)
567 verts
[0] = swizzleLane4(b
);
568 verts
[1] = swizzleLane6(b
);
569 verts
[2] = swizzleLane7(b
);
574 void PaLineLoopSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t lineIndex
, __m128 verts
[])
576 PaLineStripSingle0(pa
, slot
, lineIndex
, verts
);
578 if (pa
.numPrimsComplete
+ lineIndex
== pa
.numPrims
- 1) {
579 simdvector
&start
= PaGetSimdVector(pa
, pa
.first
, slot
);
580 verts
[1] = swizzleLane0(start
);
584 bool PaLineLoop0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
586 SetNextPaState(pa
, PaLineLoop1
, PaLineLoopSingle0
);
590 bool PaLineLoop1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
592 PaLineStrip1(pa
, slot
, verts
);
594 if (pa
.numPrimsComplete
+ KNOB_SIMD_WIDTH
> pa
.numPrims
- 1) {
595 // loop reconnect now
596 int lane
= pa
.numPrims
- pa
.numPrimsComplete
- 1;
597 simdvector
&start
= PaGetSimdVector(pa
, pa
.first
, slot
);
598 for (int i
= 0; i
< 4; i
++) {
599 float *startVtx
= (float *)&(start
[i
]);
600 float *targetVtx
= (float *)&(verts
[1][i
]);
601 targetVtx
[lane
] = startVtx
[0];
605 SetNextPaState(pa
, PaLineLoop1
, PaLineLoopSingle0
, 0, KNOB_SIMD_WIDTH
);
610 bool PaLineList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
612 SetNextPaState(pa
, PaLineList1
, PaLineListSingle0
);
613 return false; // Not enough vertices to assemble 8 lines
616 bool PaLineList1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
618 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
619 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
);
620 /// @todo: verify provoking vertex is correct
621 // Line list 0 1 2 3 4 5 6 7
622 // 8 9 10 11 12 13 14 15
625 // 0 2 4 6 8 10 12 14
626 // 1 3 5 7 9 11 13 15
628 for (uint32_t i
= 0; i
< 4; ++i
)
631 __m256 vALowBLow
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x20);
632 // 4 5 6 7 12 13 14 15
633 __m256 vAHighBHigh
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x31);
635 // 0 2 4 6 8 10 12 14
636 verts
[0].v
[i
] = _mm256_shuffle_ps(vALowBLow
, vAHighBHigh
, _MM_SHUFFLE(2, 0, 2, 0));
637 // 1 3 5 7 9 11 13 15
638 verts
[1].v
[i
] = _mm256_shuffle_ps(vALowBLow
, vAHighBHigh
, _MM_SHUFFLE(3, 1, 3, 1));
641 SetNextPaState(pa
, PaLineList0
, PaLineListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
645 void PaLineListSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
647 simdvector
&a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
648 simdvector
&b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
653 verts
[0] = swizzleLane0(a
);
654 verts
[1] = swizzleLane1(a
);
657 verts
[0] = swizzleLane2(a
);
658 verts
[1] = swizzleLane3(a
);
661 verts
[0] = swizzleLane4(a
);
662 verts
[1] = swizzleLane5(a
);
665 verts
[0] = swizzleLane6(a
);
666 verts
[1] = swizzleLane7(a
);
669 verts
[0] = swizzleLane0(b
);
670 verts
[1] = swizzleLane1(b
);
673 verts
[0] = swizzleLane2(b
);
674 verts
[1] = swizzleLane3(b
);
677 verts
[0] = swizzleLane4(b
);
678 verts
[1] = swizzleLane5(b
);
681 verts
[0] = swizzleLane6(b
);
682 verts
[1] = swizzleLane7(b
);
687 bool PaLineStrip0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
689 SetNextPaState(pa
, PaLineStrip1
, PaLineStripSingle0
);
690 return false; // Not enough vertices to assemble 8 lines
693 bool PaLineStrip1(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
695 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
696 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
698 /// @todo: verify provoking vertex is correct
699 // Line list 0 1 2 3 4 5 6 7
700 // 8 9 10 11 12 13 14 15
708 for(uint32_t i
= 0; i
< 4; ++i
)
711 __m256 vPermA
= _mm256_permute_ps(a
.v
[i
], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
713 __m256 vAHighBLow
= _mm256_permute2f128_ps(a
.v
[i
], b
.v
[i
], 0x21);
716 __m256 vPermB
= _mm256_permute_ps(vAHighBLow
, 0); // indices hi->low (0 0 0 0)
718 verts
[1].v
[i
] = _mm256_blend_ps(vPermA
, vPermB
, 0x88);
721 SetNextPaState(pa
, PaLineStrip1
, PaLineStripSingle0
, 0, KNOB_SIMD_WIDTH
);
725 void PaLineStripSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t lineIndex
, __m128 verts
[])
727 simdvector
& a
= PaGetSimdVector(pa
, pa
.prev
, slot
);
728 simdvector
& b
= PaGetSimdVector(pa
, pa
.cur
, slot
);
733 verts
[0] = swizzleLane0(a
);
734 verts
[1] = swizzleLane1(a
);
737 verts
[0] = swizzleLane1(a
);
738 verts
[1] = swizzleLane2(a
);
741 verts
[0] = swizzleLane2(a
);
742 verts
[1] = swizzleLane3(a
);
745 verts
[0] = swizzleLane3(a
);
746 verts
[1] = swizzleLane4(a
);
749 verts
[0] = swizzleLane4(a
);
750 verts
[1] = swizzleLane5(a
);
753 verts
[0] = swizzleLane5(a
);
754 verts
[1] = swizzleLane6(a
);
757 verts
[0] = swizzleLane6(a
);
758 verts
[1] = swizzleLane7(a
);
761 verts
[0] = swizzleLane7(a
);
762 verts
[1] = swizzleLane0(b
);
767 bool PaPoints0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
769 simdvector
& a
= PaGetSimdVector(pa
, pa
.cur
, slot
);
771 verts
[0] = a
; // points only have 1 vertex.
773 SetNextPaState(pa
, PaPoints0
, PaPointsSingle0
, 0, KNOB_SIMD_WIDTH
, true);
777 void PaPointsSingle0(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
779 simdvector
&a
= PaGetSimdVector(pa
, pa
.cur
, slot
);
783 verts
[0] = swizzleLane0(a
);
786 verts
[0] = swizzleLane1(a
);
789 verts
[0] = swizzleLane2(a
);
792 verts
[0] = swizzleLane3(a
);
795 verts
[0] = swizzleLane4(a
);
798 verts
[0] = swizzleLane5(a
);
801 verts
[0] = swizzleLane6(a
);
804 verts
[0] = swizzleLane7(a
);
809 //////////////////////////////////////////////////////////////////////////
810 /// @brief State 1 for RECT_LIST topology.
811 /// There is not enough to assemble 8 triangles.
812 bool PaRectList0(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[])
814 SetNextPaState(pa
, PaRectList1
, PaRectListSingle0
);
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief State 1 for RECT_LIST topology.
820 /// Rect lists has the following format.
822 /// v2 o---o v5 o---o v8 o---o v11 o---o
823 /// | \ | | \ | | \ | | \ |
824 /// v1 o---o v4 o---o v7 o---o v10 o---o
827 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
829 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
830 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
833 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
834 /// where v0 contains all the first vertices for 8 triangles.
837 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
838 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
839 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
841 /// @param pa - State for PA state machine.
842 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
843 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
849 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
850 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
851 simdvector
& b
= PaGetSimdVector(pa
, 1, slot
); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
853 __m256 tmp0
, tmp1
, tmp2
;
855 // Loop over each component in the simdvector.
856 for(int i
= 0; i
< 4; ++i
)
858 simdvector
& v0
= verts
[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
859 tmp0
= _mm256_permute2f128_ps(b
[i
], b
[i
], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
860 v0
[i
] = _mm256_blend_ps(a
[i
], tmp0
, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
861 tmp1
= _mm256_permute_ps(v0
[i
], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
862 v0
[i
] = _mm256_permute_ps(v0
[i
], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
863 v0
[i
] = _mm256_blend_ps(tmp1
, v0
[i
], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
865 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
866 /// AVX2 should make this much cheaper.
867 simdvector
& v1
= verts
[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
868 v1
[i
] = _mm256_permute_ps(a
[i
], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
869 tmp1
= _mm256_permute_ps(a
[i
], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
870 tmp2
= _mm256_blend_ps(v1
[i
], tmp1
, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
871 tmp1
= _mm256_permute2f128_ps(tmp2
, tmp2
, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
872 v1
[i
] = _mm256_permute_ps(tmp0
, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
873 v1
[i
] = _mm256_blend_ps(tmp2
, v1
[i
], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
874 v1
[i
] = _mm256_blend_ps(v1
[i
], tmp1
, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
876 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
877 simdvector
& v2
= verts
[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
878 v2
[i
] = _mm256_permute_ps(tmp0
, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
879 tmp1
= _mm256_permute_ps(tmp2
, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
880 v2
[i
] = _mm256_blend_ps(tmp1
, v2
[i
], 0xF0);
882 // Need to compute 4th implied vertex for the rectangle.
883 tmp2
= _mm256_sub_ps(v0
[i
], v1
[i
]);
884 tmp2
= _mm256_add_ps(tmp2
, v2
[i
]); // tmp2 = { w, *, x, *, y, *, z, * }
885 tmp2
= _mm256_permute_ps(tmp2
, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
886 v2
[i
] = _mm256_blend_ps(v2
[i
], tmp2
, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
889 SetNextPaState(pa
, PaRectList1
, PaRectListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
893 //////////////////////////////////////////////////////////////////////////
894 /// @brief State 2 for RECT_LIST topology.
895 /// Not implemented unless there is a use case for more than 8 rects.
896 /// @param pa - State for PA state machine.
897 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
898 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
904 SWR_ASSERT(0); // Is rect list used for anything other then clears?
905 SetNextPaState(pa
, PaRectList0
, PaRectListSingle0
, 0, KNOB_SIMD_WIDTH
, true);
909 //////////////////////////////////////////////////////////////////////////
910 /// @brief This procedure is called by the Binner to assemble the attributes.
911 /// Unlike position, which is stored vertically, the attributes are
912 /// stored horizontally. The outputs from the VS, labeled as 'a' and
913 /// 'b' are vertical. This function needs to transpose the lanes
914 /// containing the vertical attribute data into horizontal form.
915 /// @param pa - State for PA state machine.
916 /// @param slot - Index into VS output for a given attribute.
917 /// @param primIndex - Binner processes each triangle individually.
918 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
919 void PaRectListSingle0(
925 // We have 12 simdscalars contained within 3 simdvectors which
926 // hold at least 8 triangles worth of data. We want to assemble a single
927 // triangle with data in horizontal form.
928 simdvector
& a
= PaGetSimdVector(pa
, 0, slot
);
930 // Convert from vertical to horizontal.
934 verts
[0] = swizzleLane0(a
);
935 verts
[1] = swizzleLane1(a
);
936 verts
[2] = swizzleLane2(a
);
939 verts
[0] = swizzleLane0(a
);
940 verts
[1] = swizzleLane2(a
);
941 verts
[2] = _mm_blend_ps(verts
[0], verts
[1], 0x2);
954 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT
*in_pDC
, uint32_t in_numPrims
, uint8_t* pStream
, uint32_t in_streamSizeInVerts
,
955 bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
) : PA_STATE(in_pDC
, pStream
, in_streamSizeInVerts
), numPrims(in_numPrims
), numPrimsComplete(0), numSimdPrims(0),
956 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming
)
958 const API_STATE
& state
= GetApiState(pDC
);
960 this->binTopology
= topo
== TOP_UNKNOWN
? state
.topology
: topo
;
962 switch (this->binTopology
)
964 case TOP_TRIANGLE_LIST
:
965 this->pfnPaFunc
= PaTriList0
;
967 case TOP_TRIANGLE_STRIP
:
968 this->pfnPaFunc
= PaTriStrip0
;
970 case TOP_TRIANGLE_FAN
:
971 this->pfnPaFunc
= PaTriFan0
;
974 this->pfnPaFunc
= PaQuadList0
;
975 this->numPrims
= in_numPrims
* 2; // Convert quad primitives into triangles
978 // quad strip pattern when decomposed into triangles is the same as triangle strips
979 this->pfnPaFunc
= PaTriStrip0
;
980 this->numPrims
= in_numPrims
* 2; // Convert quad primitives into triangles
983 this->pfnPaFunc
= PaLineList0
;
984 this->numPrims
= in_numPrims
;
987 this->pfnPaFunc
= PaLineStrip0
;
988 this->numPrims
= in_numPrims
;
991 this->pfnPaFunc
= PaLineLoop0
;
992 this->numPrims
= in_numPrims
;
995 // use point binner and rasterizer if supported
996 this->pfnPaFunc
= PaPoints0
;
997 this->numPrims
= in_numPrims
;
1000 this->pfnPaFunc
= PaRectList0
;
1001 this->numPrims
= in_numPrims
* 2;
1004 case TOP_PATCHLIST_1
:
1005 this->pfnPaFunc
= PaPatchList
<1>;
1007 case TOP_PATCHLIST_2
:
1008 this->pfnPaFunc
= PaPatchList
<2>;
1010 case TOP_PATCHLIST_3
:
1011 this->pfnPaFunc
= PaPatchList
<3>;
1013 case TOP_PATCHLIST_4
:
1014 this->pfnPaFunc
= PaPatchList
<4>;
1016 case TOP_PATCHLIST_5
:
1017 this->pfnPaFunc
= PaPatchList
<5>;
1019 case TOP_PATCHLIST_6
:
1020 this->pfnPaFunc
= PaPatchList
<6>;
1022 case TOP_PATCHLIST_7
:
1023 this->pfnPaFunc
= PaPatchList
<7>;
1025 case TOP_PATCHLIST_8
:
1026 this->pfnPaFunc
= PaPatchList
<8>;
1028 case TOP_PATCHLIST_9
:
1029 this->pfnPaFunc
= PaPatchList
<9>;
1031 case TOP_PATCHLIST_10
:
1032 this->pfnPaFunc
= PaPatchList
<10>;
1034 case TOP_PATCHLIST_11
:
1035 this->pfnPaFunc
= PaPatchList
<11>;
1037 case TOP_PATCHLIST_12
:
1038 this->pfnPaFunc
= PaPatchList
<12>;
1040 case TOP_PATCHLIST_13
:
1041 this->pfnPaFunc
= PaPatchList
<13>;
1043 case TOP_PATCHLIST_14
:
1044 this->pfnPaFunc
= PaPatchList
<14>;
1046 case TOP_PATCHLIST_15
:
1047 this->pfnPaFunc
= PaPatchList
<15>;
1049 case TOP_PATCHLIST_16
:
1050 this->pfnPaFunc
= PaPatchList
<16>;
1052 case TOP_PATCHLIST_17
:
1053 this->pfnPaFunc
= PaPatchList
<17>;
1055 case TOP_PATCHLIST_18
:
1056 this->pfnPaFunc
= PaPatchList
<18>;
1058 case TOP_PATCHLIST_19
:
1059 this->pfnPaFunc
= PaPatchList
<19>;
1061 case TOP_PATCHLIST_20
:
1062 this->pfnPaFunc
= PaPatchList
<20>;
1064 case TOP_PATCHLIST_21
:
1065 this->pfnPaFunc
= PaPatchList
<21>;
1067 case TOP_PATCHLIST_22
:
1068 this->pfnPaFunc
= PaPatchList
<22>;
1070 case TOP_PATCHLIST_23
:
1071 this->pfnPaFunc
= PaPatchList
<23>;
1073 case TOP_PATCHLIST_24
:
1074 this->pfnPaFunc
= PaPatchList
<24>;
1076 case TOP_PATCHLIST_25
:
1077 this->pfnPaFunc
= PaPatchList
<25>;
1079 case TOP_PATCHLIST_26
:
1080 this->pfnPaFunc
= PaPatchList
<26>;
1082 case TOP_PATCHLIST_27
:
1083 this->pfnPaFunc
= PaPatchList
<27>;
1085 case TOP_PATCHLIST_28
:
1086 this->pfnPaFunc
= PaPatchList
<28>;
1088 case TOP_PATCHLIST_29
:
1089 this->pfnPaFunc
= PaPatchList
<29>;
1091 case TOP_PATCHLIST_30
:
1092 this->pfnPaFunc
= PaPatchList
<30>;
1094 case TOP_PATCHLIST_31
:
1095 this->pfnPaFunc
= PaPatchList
<31>;
1097 case TOP_PATCHLIST_32
:
1098 this->pfnPaFunc
= PaPatchList
<32>;
1106 this->pfnPaFuncReset
= this->pfnPaFunc
;
1108 // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
1109 // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
1110 simdscalari id8
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1111 simdscalari id4
= _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
1113 switch(this->binTopology
)
1115 case TOP_TRIANGLE_LIST
:
1116 case TOP_TRIANGLE_STRIP
:
1117 case TOP_TRIANGLE_FAN
:
1118 case TOP_LINE_STRIP
:
1121 this->primIDIncr
= 8;
1125 case TOP_QUAD_STRIP
:
1127 this->primIDIncr
= 4;
1130 case TOP_POINT_LIST
:
1131 this->primIDIncr
= 8;
1134 case TOP_PATCHLIST_1
:
1135 case TOP_PATCHLIST_2
:
1136 case TOP_PATCHLIST_3
:
1137 case TOP_PATCHLIST_4
:
1138 case TOP_PATCHLIST_5
:
1139 case TOP_PATCHLIST_6
:
1140 case TOP_PATCHLIST_7
:
1141 case TOP_PATCHLIST_8
:
1142 case TOP_PATCHLIST_9
:
1143 case TOP_PATCHLIST_10
:
1144 case TOP_PATCHLIST_11
:
1145 case TOP_PATCHLIST_12
:
1146 case TOP_PATCHLIST_13
:
1147 case TOP_PATCHLIST_14
:
1148 case TOP_PATCHLIST_15
:
1149 case TOP_PATCHLIST_16
:
1150 case TOP_PATCHLIST_17
:
1151 case TOP_PATCHLIST_18
:
1152 case TOP_PATCHLIST_19
:
1153 case TOP_PATCHLIST_20
:
1154 case TOP_PATCHLIST_21
:
1155 case TOP_PATCHLIST_22
:
1156 case TOP_PATCHLIST_23
:
1157 case TOP_PATCHLIST_24
:
1158 case TOP_PATCHLIST_25
:
1159 case TOP_PATCHLIST_26
:
1160 case TOP_PATCHLIST_27
:
1161 case TOP_PATCHLIST_28
:
1162 case TOP_PATCHLIST_29
:
1163 case TOP_PATCHLIST_30
:
1164 case TOP_PATCHLIST_31
:
1165 case TOP_PATCHLIST_32
:
1166 // Always run KNOB_SIMD_WIDTH number of patches at a time.
1167 this->primIDIncr
= 8;