7f60a04a9ff7e02f9850f11df671bdf66cd05c2b
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
// Abstract interface for a primitive assembler: accumulates SIMD vertex-shader
// output and hands back assembled primitives a SIMD batch at a time.
struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    // SIMD16 frontend: batches are 16 primitives wide.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    // Native SIMD frontend: batches are KNOB_SIMD_WIDTH primitives wide.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    // When true, simd16 assembly paths return the upper half of the 16-wide
    // result (see PA_STATE_CUT::Assemble / AssembleSingle).
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}

    // True while primitives remain to be assembled for the current draw.
    virtual bool HasWork() = 0;
    // Access one attribute slot of a stored simd vertex.
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    // Gather one attribute slot for the current batch of assembled prims
    // into SOA form; returns false until a batch is ready.
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    // Extract a single primitive's attribute slot as AOS xyzw __m128s.
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    // Advance the state machine; implementations return whether more prims
    // can be produced from what is already buffered.
    virtual bool NextPrim() = 0;
    // Slot the next vertex-shader invocation should write its output to.
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    // Number of primitives assembled into the current batch.
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    // Per-lane primitive IDs for the current batch, offset by startID.
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
103
104 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
105 // output. Here is the sequence
106 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
107 // 2. Execute PA function to assemble and bin triangles.
108 // a. The PA function is a set of functions that collectively make up the
109 // state machine for a given topology.
110 // 1. We use a state index to track which PA function to call.
111 // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
112 // 1. We call this the current and previous simd vertex.
113 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
114 // order to assemble the second triangle, for a triangle list, we'll need the
115 // last vertex from the previous simd and the first 2 vertices from the current simd.
116 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
117 //
118 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
119 // cuts
120 struct PA_STATE_OPT : public PA_STATE
121 {
122 SIMDVERTEX leadingVertex; // For tri-fan
123
124 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
125 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
126
127 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
128
129 uint32_t cur{ 0 }; // index to current VS output.
130 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
131 uint32_t first{ 0 }; // index to first VS output. Used for trifan.
132
133 uint32_t counter{ 0 }; // state counter
134 bool reset{ false }; // reset state
135
136 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
137 SIMDSCALARI primID;
138
139 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
140 #if ENABLE_AVX512_SIMD16
141 typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
142 #endif
143 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
144
145 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
146 #if ENABLE_AVX512_SIMD16
147 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
148 #endif
149 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
150 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
151 #if ENABLE_AVX512_SIMD16
152 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
153 #endif
154
155 // state used to advance the PA when Next is called
156 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
157 #if ENABLE_AVX512_SIMD16
158 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
159 #endif
160 uint32_t nextNumSimdPrims{ 0 };
161 uint32_t nextNumPrimsIncrement{ 0 };
162 bool nextReset{ false };
163 bool isStreaming{ false };
164
165 SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function
166
167 PA_STATE_OPT() {}
168 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
169 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
170
171 bool HasWork()
172 {
173 return (this->numPrimsComplete < this->numPrims) ? true : false;
174 }
175
176 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
177 {
178 simdvertex* pVertex = (simdvertex*)pStreamBase;
179 return pVertex[index].attrib[slot];
180 }
181
182 #if ENABLE_AVX512_SIMD16
183 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
184 {
185 simd16vertex* pVertex = (simd16vertex*)pStreamBase;
186 return pVertex[index].attrib[slot];
187 }
188
189 #endif
190 // Assembles 4 triangles. Each simdvector is a single vertex from 4
191 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
192 bool Assemble(uint32_t slot, simdvector verts[])
193 {
194 return this->pfnPaFunc(*this, slot, verts);
195 }
196
197 #if ENABLE_AVX512_SIMD16
198 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
199 {
200 return this->pfnPaFunc_simd16(*this, slot, verts);
201 }
202
203 #endif
204 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
205 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
206 {
207 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
208 }
209
210 bool NextPrim()
211 {
212 this->pfnPaFunc = this->pfnPaNextFunc;
213 #if ENABLE_AVX512_SIMD16
214 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
215 #endif
216 this->numSimdPrims = this->nextNumSimdPrims;
217 this->numPrimsComplete += this->nextNumPrimsIncrement;
218 this->reset = this->nextReset;
219
220 if (this->isStreaming)
221 {
222 this->reset = false;
223 }
224
225 bool morePrims = false;
226
227 if (this->numSimdPrims > 0)
228 {
229 morePrims = true;
230 this->numSimdPrims--;
231 }
232 else
233 {
234 this->counter = (this->reset) ? 0 : (this->counter + 1);
235 this->reset = false;
236 }
237
238 this->pfnPaFunc = this->pfnPaNextFunc;
239
240 if (!HasWork())
241 {
242 morePrims = false; // no more to do
243 }
244
245 return morePrims;
246 }
247
248 SIMDVERTEX& GetNextVsOutput()
249 {
250 // increment cur and prev indices
251 const uint32_t numSimdVerts = this->streamSizeInVerts / SIMD_WIDTH;
252 this->prev = this->cur; // prev is undefined for first state.
253 this->cur = this->counter % numSimdVerts;
254
255 SIMDVERTEX* pVertex = (SIMDVERTEX*)pStreamBase;
256 return pVertex[this->cur];
257 }
258
259 SIMDMASK& GetNextVsIndices()
260 {
261 // unused in optimized PA, pass tmp buffer back
262 return tmpIndices;
263 }
264
265 bool GetNextStreamOutput()
266 {
267 this->prev = this->cur;
268 this->cur = this->counter;
269
270 return HasWork();
271 }
272
273 uint32_t NumPrims()
274 {
275 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
276 (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
277 }
278
279 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
280 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
281 uint32_t numSimdPrims = 0,
282 uint32_t numPrimsIncrement = 0,
283 bool reset = false)
284 {
285 this->pfnPaNextFunc = pfnPaNextFunc;
286 this->nextNumSimdPrims = numSimdPrims;
287 this->nextNumPrimsIncrement = numPrimsIncrement;
288 this->nextReset = reset;
289
290 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
291 }
292
293 #if ENABLE_AVX512_SIMD16
294 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
295 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
296 uint32_t numSimdPrims = 0,
297 uint32_t numPrimsIncrement = 0,
298 bool reset = false)
299 {
300 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
301 this->nextNumSimdPrims = numSimdPrims;
302 this->nextNumPrimsIncrement = numPrimsIncrement;
303 this->nextReset = reset;
304
305 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
306 }
307
308 #endif
309 void Reset()
310 {
311 #if ENABLE_AVX512_SIMD16
312 useAlternateOffset = false;
313
314 #endif
315 this->pfnPaFunc = this->pfnPaFuncReset;
316 this->numPrimsComplete = 0;
317 this->numSimdPrims = 0;
318 this->cur = 0;
319 this->prev = 0;
320 this->first = 0;
321 this->counter = 0;
322 this->reset = false;
323 }
324
325 SIMDSCALARI GetPrimID(uint32_t startID)
326 {
327 #if USE_SIMD16_FRONTEND
328 return _simd16_add_epi32(this->primID,
329 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
330 #else
331 return _simd_add_epi32(this->primID,
332 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
333 #endif
334 }
335 };
336
337 // helper C wrappers to avoid having to rewrite all the PA topology state functions
338 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
339 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
340 uint32_t numSimdPrims = 0,
341 uint32_t numPrimsIncrement = 0,
342 bool reset = false)
343 {
344 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
345 }
346
#if ENABLE_AVX512_SIMD16
// simd16 variant of the SetNextPaState shim.
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
358 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
359 {
360 return pa.GetSimdVector(index, slot);
361 }
362
#if ENABLE_AVX512_SIMD16
// Forwarding wrapper around the virtual simd16 vector accessor.
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simd16vector& attrib = pa.GetSimdVector_simd16(index, slot);
    return attrib;
}

#endif
370 INLINE __m128 swizzleLane0(const simdvector &a)
371 {
372 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
373 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
374 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
375 }
376
377 INLINE __m128 swizzleLane1(const simdvector &a)
378 {
379 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
380 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
381 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
382 }
383
384 INLINE __m128 swizzleLane2(const simdvector &a)
385 {
386 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
387 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
388 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
389 }
390
391 INLINE __m128 swizzleLane3(const simdvector &a)
392 {
393 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
394 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
395 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
396 }
397
398 INLINE __m128 swizzleLane4(const simdvector &a)
399 {
400 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
401 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
402 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
403
404 }
405
406 INLINE __m128 swizzleLane5(const simdvector &a)
407 {
408 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
409 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
410 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
411 }
412
413 INLINE __m128 swizzleLane6(const simdvector &a)
414 {
415 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
416 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
417 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
418 }
419
420 INLINE __m128 swizzleLane7(const simdvector &a)
421 {
422 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
423 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
424 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
425 }
426
427 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
428 {
429 switch (lane) {
430 case 0:
431 return swizzleLane0(a);
432 case 1:
433 return swizzleLane1(a);
434 case 2:
435 return swizzleLane2(a);
436 case 3:
437 return swizzleLane3(a);
438 case 4:
439 return swizzleLane4(a);
440 case 5:
441 return swizzleLane5(a);
442 case 6:
443 return swizzleLane6(a);
444 case 7:
445 return swizzleLane7(a);
446 default:
447 return _mm_setzero_ps();
448 }
449 }
450
451 // Cut-aware primitive assembler.
452 struct PA_STATE_CUT : public PA_STATE
453 {
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };         // verts consumed per primitive for the bound topology
    SIMDVERTEX tmpVertex;               // temporary simdvertex for unimplemented API
    bool processCutVerts{ false };      // vertex indices with cuts should be processed as normal, otherwise they
                                        // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
                                        // while the GS sends valid verts for every index

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];  // sliding window of vertex indices for the prim being built
    uint32_t curIndex{ 0 };                 // next write position within vert[]
    bool reverseWinding{ false };           // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };              // extra vert uses for tristrip w/ adj; -1 when none pending

    // per-topology member function that processes a single vert
    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };
485
    PA_STATE_CUT() {}

    // Binds the vertex store / cut-index buffer and selects the per-vertex
    // processing function for the given topology.
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
        // seed the per-lane prim IDs with 0..SIMD_WIDTH-1
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;  // -1 == no deferred tri-strip-adj vertex pending

        // verts per prim depends on both the topology and whether a GS is bound
        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        // bind the per-vertex state machine for this topology
        switch (topo)
        {
        case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ: if (gsEnabled)
        {
            pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
        }
        else
        {
            pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
        }
        break;

        case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }
538
539 SIMDVERTEX& GetNextVsOutput()
540 {
541 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
542 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
543 this->needOffsets = true;
544 return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
545 }
546
547 SIMDMASK& GetNextVsIndices()
548 {
549 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
550 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
551 return *pCurCutIndex;
552 }
553
    // Not used by the cut-aware PA; asserts and returns a static dummy so the
    // interface contract (returning a reference) is still satisfied.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        static simdvector junk;
        return junk;
    }
561
#if ENABLE_AVX512_SIMD16
    // Not used by the cut-aware PA; asserts and returns a static dummy so the
    // interface contract (returning a reference) is still satisfied.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        static simd16vector junk;
        return junk;
    }

#endif
572 bool GetNextStreamOutput()
573 {
574 this->headVertex += SIMD_WIDTH;
575 this->needOffsets = true;
576 return HasWork();
577 }
578
579 SIMDSCALARI GetPrimID(uint32_t startID)
580 {
581 #if USE_SIMD16_FRONTEND
582 return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
583 #else
584 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
585 #endif
586 }
587
588 void Reset()
589 {
590 #if ENABLE_AVX512_SIMD16
591 useAlternateOffset = false;
592
593 #endif
594 this->numRemainingVerts = this->numVertsToAssemble;
595 this->numPrimsAssembled = 0;
596 this->curIndex = 0;
597 this->curVertex = 0;
598 this->tailVertex = 0;
599 this->headVertex = 0;
600 this->reverseWinding = false;
601 this->adjExtraVert = -1;
602 #if USE_SIMD16_FRONTEND
603 this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
604 #else
605 this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
606 #endif
607 }
608
609 bool HasWork()
610 {
611 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
612 }
613
614 bool IsVertexStoreFull()
615 {
616 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
617 }
618
619 void RestartTopology()
620 {
621 this->curIndex = 0;
622 this->reverseWinding = false;
623 this->adjExtraVert = -1;
624 }
625
626 bool IsCutIndex(uint32_t vertex)
627 {
628 uint32_t vertexIndex = vertex / SIMD_WIDTH;
629 uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
630 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
631 }
632
    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
            this->numRemainingVerts > 0 &&
            this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    // cut verts still carry valid data on this path; process
                    // the vert before restarting (see processCutVerts comment)
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            // advance through the ring buffer, wrapping at numVerts
            this->curVertex++;
            if (this->curVertex >= this->numVerts) {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }
673
674 void Advance()
675 {
676 // done with current batch
677 // advance tail to the current unsubmitted vertex
678 this->tailVertex = this->curVertex;
679 this->numPrimsAssembled = 0;
680 #if USE_SIMD16_FRONTEND
681 this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
682 #else
683 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
684 #endif
685 }
686
687 bool NextPrim()
688 {
689 // if we've assembled enough prims, we can advance to the next set of verts
690 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
691 {
692 Advance();
693 }
694 return false;
695 }
696
    // Converts the gathered vertex indices in indices[][] into byte offsets
    // (vOffsets) into the SOA vertex store, used by the gathers in Assemble.
    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch: batch = index >> SIMD_WIDTH_LOG2,
            // byte offset = batch * sizeof(SIMDVERTEX)
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
#endif

            // step to index: lane-within-batch * sizeof(float) into the SOA data
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
724
    // Gathers one attribute slot for the assembled primitives into SOA
    // simdvectors. Returns false until a full SIMD batch is assembled (or the
    // draw has run out of verts).
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                // caller selects which half of the simd16 gather to consume
                verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }
772
#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble(): same flow, but the output vectors are
    // 16 wide. Without the simd16 frontend, only the low half is populated.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                // only 8 prims are available; zero the upper half
                verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1);
                verts[v].v[c].hi = _simd_setzero_ps();
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    // Copies a single primitive's attribute slot out of the vertex store into
    // AOS form: one xyzw __m128 per vertex of the primitive.
    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            // alternate offset selects lanes from the upper half of the simd16 batch
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                // components of one attribute are SIMD_WIDTH floats apart (SOA layout)
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }
843
844 uint32_t NumPrims()
845 {
846 return this->numPrimsAssembled;
847 }
848
849 // Per-topology functions
850 void ProcessVertTriStrip(uint32_t index, bool finish)
851 {
852 this->vert[this->curIndex] = index;
853 this->curIndex++;
854 if (this->curIndex == 3)
855 {
856 // assembled enough verts for prim, add to gather indices
857 this->indices[0][this->numPrimsAssembled] = this->vert[0];
858 if (reverseWinding)
859 {
860 this->indices[1][this->numPrimsAssembled] = this->vert[2];
861 this->indices[2][this->numPrimsAssembled] = this->vert[1];
862 }
863 else
864 {
865 this->indices[1][this->numPrimsAssembled] = this->vert[1];
866 this->indices[2][this->numPrimsAssembled] = this->vert[2];
867 }
868
869 // increment numPrimsAssembled
870 this->numPrimsAssembled++;
871
872 // set up next prim state
873 this->vert[0] = this->vert[1];
874 this->vert[1] = this->vert[2];
875 this->curIndex = 2;
876 this->reverseWinding ^= 1;
877 }
878 }
879
    // Emits one tri-strip-adj primitive from the current vert[] window.
    // With a GS bound all 6 indices go out; without one, only the interior
    // triangle (3 indices) is emitted and the window is adjusted around it.
    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            // collapse the window so slots 0-2 hold the interior triangle
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // restore the window layout after emitting
            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }
906
907
908 template<bool gsEnabled>
909 void ProcessVertTriStripAdj(uint32_t index, bool finish)
910 {
911 // handle last primitive of tristrip
912 if (finish && this->adjExtraVert != -1)
913 {
914 this->vert[3] = this->adjExtraVert;
915 AssembleTriStripAdj<gsEnabled>();
916 this->adjExtraVert = -1;
917 return;
918 }
919
920 switch (this->curIndex)
921 {
922 case 0:
923 case 1:
924 case 2:
925 case 4:
926 this->vert[this->curIndex] = index;
927 this->curIndex++;
928 break;
929 case 3:
930 this->vert[5] = index;
931 this->curIndex++;
932 break;
933 case 5:
934 if (this->adjExtraVert == -1)
935 {
936 this->adjExtraVert = index;
937 }
938 else
939 {
940 this->vert[3] = index;
941 if (!gsEnabled)
942 {
943 AssembleTriStripAdj<gsEnabled>();
944
945 uint32_t nextTri[6];
946 if (this->reverseWinding)
947 {
948 nextTri[0] = this->vert[4];
949 nextTri[1] = this->vert[0];
950 nextTri[2] = this->vert[2];
951 nextTri[4] = this->vert[3];
952 nextTri[5] = this->adjExtraVert;
953 }
954 else
955 {
956 nextTri[0] = this->vert[2];
957 nextTri[1] = this->adjExtraVert;
958 nextTri[2] = this->vert[3];
959 nextTri[4] = this->vert[4];
960 nextTri[5] = this->vert[0];
961 }
962 for (uint32_t i = 0; i < 6; ++i)
963 {
964 this->vert[i] = nextTri[i];
965 }
966
967 this->adjExtraVert = -1;
968 this->reverseWinding ^= 1;
969 }
970 else
971 {
972 this->curIndex++;
973 }
974 }
975 break;
976 case 6:
977 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
978 AssembleTriStripAdj<gsEnabled>();
979
980 uint32_t nextTri[6];
981 if (this->reverseWinding)
982 {
983 nextTri[0] = this->vert[4];
984 nextTri[1] = this->vert[0];
985 nextTri[2] = this->vert[2];
986 nextTri[4] = this->vert[3];
987 nextTri[5] = this->adjExtraVert;
988 }
989 else
990 {
991 nextTri[0] = this->vert[2];
992 nextTri[1] = this->adjExtraVert;
993 nextTri[2] = this->vert[3];
994 nextTri[4] = this->vert[4];
995 nextTri[5] = this->vert[0];
996 }
997 for (uint32_t i = 0; i < 6; ++i)
998 {
999 this->vert[i] = nextTri[i];
1000 }
1001 this->reverseWinding ^= 1;
1002 this->adjExtraVert = index;
1003 this->curIndex--;
1004 break;
1005 }
1006 }
1007
1008 void ProcessVertTriList(uint32_t index, bool finish)
1009 {
1010 this->vert[this->curIndex] = index;
1011 this->curIndex++;
1012 if (this->curIndex == 3)
1013 {
1014 // assembled enough verts for prim, add to gather indices
1015 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1016 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1017 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1018
1019 // increment numPrimsAssembled
1020 this->numPrimsAssembled++;
1021
1022 // set up next prim state
1023 this->curIndex = 0;
1024 }
1025 }
1026
1027 void ProcessVertTriListAdj(uint32_t index, bool finish)
1028 {
1029 this->vert[this->curIndex] = index;
1030 this->curIndex++;
1031 if (this->curIndex == 6)
1032 {
1033 // assembled enough verts for prim, add to gather indices
1034 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1035 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1036 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1037 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1038 this->indices[4][this->numPrimsAssembled] = this->vert[4];
1039 this->indices[5][this->numPrimsAssembled] = this->vert[5];
1040
1041 // increment numPrimsAssembled
1042 this->numPrimsAssembled++;
1043
1044 // set up next prim state
1045 this->curIndex = 0;
1046 }
1047 }
1048
1049 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
1050 {
1051 this->vert[this->curIndex] = index;
1052 this->curIndex++;
1053 if (this->curIndex == 6)
1054 {
1055 // assembled enough verts for prim, add to gather indices
1056 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1057 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1058 this->indices[2][this->numPrimsAssembled] = this->vert[4];
1059
1060 // increment numPrimsAssembled
1061 this->numPrimsAssembled++;
1062
1063 // set up next prim state
1064 this->curIndex = 0;
1065 }
1066 }
1067
1068
1069 void ProcessVertLineList(uint32_t index, bool finish)
1070 {
1071 this->vert[this->curIndex] = index;
1072 this->curIndex++;
1073 if (this->curIndex == 2)
1074 {
1075 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1076 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1077
1078 this->numPrimsAssembled++;
1079 this->curIndex = 0;
1080 }
1081 }
1082
1083 void ProcessVertLineStrip(uint32_t index, bool finish)
1084 {
1085 this->vert[this->curIndex] = index;
1086 this->curIndex++;
1087 if (this->curIndex == 2)
1088 {
1089 // assembled enough verts for prim, add to gather indices
1090 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1091 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1092
1093 // increment numPrimsAssembled
1094 this->numPrimsAssembled++;
1095
1096 // set up next prim state
1097 this->vert[0] = this->vert[1];
1098 this->curIndex = 1;
1099 }
1100 }
1101
1102 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1103 {
1104 this->vert[this->curIndex] = index;
1105 this->curIndex++;
1106 if (this->curIndex == 4)
1107 {
1108 // assembled enough verts for prim, add to gather indices
1109 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1110 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1111 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1112 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1113
1114 // increment numPrimsAssembled
1115 this->numPrimsAssembled++;
1116
1117 // set up next prim state
1118 this->vert[0] = this->vert[1];
1119 this->vert[1] = this->vert[2];
1120 this->vert[2] = this->vert[3];
1121 this->curIndex = 3;
1122 }
1123 }
1124
1125 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1126 {
1127 this->vert[this->curIndex] = index;
1128 this->curIndex++;
1129 if (this->curIndex == 4)
1130 {
1131 // assembled enough verts for prim, add to gather indices
1132 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1133 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1134
1135 // increment numPrimsAssembled
1136 this->numPrimsAssembled++;
1137
1138 // set up next prim state
1139 this->vert[0] = this->vert[1];
1140 this->vert[1] = this->vert[2];
1141 this->vert[2] = this->vert[3];
1142 this->curIndex = 3;
1143 }
1144 }
1145
1146 void ProcessVertLineListAdj(uint32_t index, bool finish)
1147 {
1148 this->vert[this->curIndex] = index;
1149 this->curIndex++;
1150 if (this->curIndex == 4)
1151 {
1152 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1153 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1154 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1155 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1156
1157 this->numPrimsAssembled++;
1158 this->curIndex = 0;
1159 }
1160 }
1161
1162 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1163 {
1164 this->vert[this->curIndex] = index;
1165 this->curIndex++;
1166 if (this->curIndex == 4)
1167 {
1168 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1169 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1170
1171 this->numPrimsAssembled++;
1172 this->curIndex = 0;
1173 }
1174 }
1175
1176 void ProcessVertPointList(uint32_t index, bool finish)
1177 {
1178 this->vert[this->curIndex] = index;
1179 this->curIndex++;
1180 if (this->curIndex == 1)
1181 {
1182 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1183 this->numPrimsAssembled++;
1184 this->curIndex = 0;
1185 }
1186 }
1187 };
1188
1189 // Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    /// @brief Construct a tessellation PA over pre-computed DS output.
    ///
    /// Vertex data is laid out attribute-major: each attribute slot occupies
    /// 4 components, and each component spans m_attributeStrideInVectors SIMD
    /// vectors (see the pointer math in Assemble/AssembleSingle).
    ///
    /// @param in_pDC                       owning draw context, forwarded to PA_STATE.
    /// @param in_pVertData                 SoA SIMD vertex data produced by the domain shader.
    /// @param in_attributeStrideInVectors  stride, in SIMD vectors, between components of one attribute.
    /// @param in_numAttributes             number of attribute slots in the vertex data.
    /// @param in_ppIndices                 three per-vertex index streams (one per vertex of a prim;
    ///                                     at most 3 verts per prim for point/line/tri topologies).
    /// @param in_numPrims                  total number of primitives to assemble.
    /// @param in_binTopology               output topology; must be point, line, or triangle list.
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // Derive vertex count per primitive from the bin topology; only the
        // three list topologies are legal coming out of tessellation here.
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    // True while primitives remain to be assembled.
    bool HasWork()
    {
        return m_numPrims != 0;
    }

    // Not meaningful for the tessellation PA; asserts and returns a dummy.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simdvector junk;
        return junk;
    }

#if ENABLE_AVX512_SIMD16
    // Not meaningful for the tessellation PA; asserts and returns a dummy.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simd16vector junk;
        return junk;
    }

#endif
    // Build a per-lane gather mask with -1 in the first numPrims lanes and 0
    // in the rest, by loading a sliding window from a constant -1/0 table.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
            0, 0, 0, 0, 0, 0, 0, 0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    // Gather one attribute slot for every vertex of the current batch of
    // prims into 'verts' (one simdvector per vertex of the prim). Lanes
    // beyond the live prim count are masked off. Returns false when no
    // prims remain.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        // Base of this attribute slot: 4 components per slot, each a run of
        // m_attributeStrideInVectors SIMD vectors.
        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);

                // simd16 gathers both halves; pick the half this simd8 pass wants.
                verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble: same gather, but fills simd16vectors.
    // Without the simd16 frontend only the low half is populated; the high
    // half is zeroed.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);
#else
                verts[i].v[c].lo = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c].hi = _simd_setzero_ps();
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    // Scalar fetch of one attribute slot for a single primitive (all of its
    // vertices), writing 4 floats per vertex into 'verts'.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            // With the simd16 frontend, the second simd8 pass reads indices
            // from the upper half of the simd16 lane range.
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    // Consume the current batch: decrement the remaining prim count and
    // advance all three index streams. Returns true while work remains.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    // Not applicable to the tessellation PA; asserts and returns a dummy.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDVERTEX junk;
        return junk;
    }

    // Not applicable to the tessellation PA; asserts and returns false.
    bool GetNextStreamOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        return false;
    }

    // Not applicable to the tessellation PA; asserts and returns a dummy.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDMASK junk;
        return junk;
    }

    // Number of prims in the current SIMD batch (remaining, capped at SIMD_WIDTH).
    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    // Reset is not supported for this PA.
    void Reset() { SWR_ASSERT(0); };

    // Per-lane primitive IDs: broadcast startID plus the lane offsets held
    // in m_vPrimId.
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;           // SoA DS output (not owned)
    uint32_t m_attributeStrideInVectors = 0;             // SIMD vectors per attribute component
    uint32_t m_numAttributes = 0;                        // attribute slots available
    uint32_t m_numPrims = 0;                             // prims left to assemble
    uint32_t* m_ppIndices[3];                            // per-vertex index streams (advance as prims are consumed)

    uint32_t m_numVertsPerPrim = 0;                      // 1, 2, or 3 depending on binTopology

    SIMDSCALARI m_vPrimId;                               // per-lane prim-id offsets added in GetPrimID
};
1459
1460 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1461 // based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    /// @brief Selects and placement-constructs the appropriate primitive
    ///        assembler (cut-aware or optimized) for the given topology.
    ///
    /// The cut-aware PA is used for indexed draws with primitive-restart
    /// (cut index) enabled on the basic topologies, and for all adjacency
    /// topologies. Everything else uses the optimized PA.
    ///
    /// @param pDC      draw context, forwarded to the constructed PA.
    /// @param in_topo  topology of this draw.
    /// @param numVerts vertex count for the draw.
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            // construct the cut-aware PA in-place over this factory's storage
            new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            // default path: optimized PA, constructed in-place
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
            cutPA = false;
        }

    }

    // Returns the PA chosen by the constructor, as the common base type.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;                 // optimized assembler (valid when !cutPA)
    PA_STATE_CUT paCut;                 // cut-aware assembler (valid when cutPA)
    bool cutPA{ false };                // which of the two was constructed

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    // backing storage handed to whichever PA is constructed above
    PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM];
    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};