4bb3236a638550a771a2ba9d33b5d1cf17d3c866
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
// Abstract interface shared by the primitive assemblers in this file
// (the optimized PA_STATE_OPT and the cut-aware PA_STATE_CUT below).
// The SIMD typedefs select 16-wide or 8-wide types at compile time.
35 struct PA_STATE
36 {
37 #if USE_SIMD16_FRONTEND
38 enum
39 {
40 SIMD_WIDTH = KNOB_SIMD16_WIDTH,
41 SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
42 SIMD_WIDTH_LOG2 = 4
43 };
44
45 typedef simd16mask SIMDMASK;
46
47 typedef simd16scalar SIMDSCALAR;
48 typedef simd16vector SIMDVECTOR;
49 typedef simd16vertex SIMDVERTEX;
50
51 typedef simd16scalari SIMDSCALARI;
52
53 #else
54 enum
55 {
56 SIMD_WIDTH = KNOB_SIMD_WIDTH,
57 SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
58 SIMD_WIDTH_LOG2 = 3
59 };
60
61 typedef simdmask SIMDMASK;
62
63 typedef simdscalar SIMDSCALAR;
64 typedef simdvector SIMDVECTOR;
65 typedef simdvertex SIMDVERTEX;
66
67 typedef simdscalari SIMDSCALARI;
68
69 #endif
70 DRAW_CONTEXT *pDC{ nullptr }; // draw context
71 uint8_t* pStreamBase{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
73 uint32_t vertexStride{ 0 }; // stride of a vertex in simdvector units
74
75 // The topology the binner will use. In some cases the FE changes the topology from the api state.
76 PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
77
78 #if ENABLE_AVX512_SIMD16
// When set, SIMD16-wide data is consumed as two SIMD8 halves; the
// alternate (upper) half is selected. See PA_STATE_CUT::Assemble/AssembleSingle.
79 bool useAlternateOffset{ false };
80
81 #endif
82 PA_STATE() {}
83 PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
84 pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}
85
// Pure-virtual interface implemented by each concrete assembler.
86 virtual bool HasWork() = 0;
87 virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
88 #if ENABLE_AVX512_SIMD16
89 virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
90 #endif
91 virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
92 #if ENABLE_AVX512_SIMD16
93 virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
94 #endif
95 virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
96 virtual bool NextPrim() = 0;
97 virtual SIMDVERTEX& GetNextVsOutput() = 0;
98 virtual bool GetNextStreamOutput() = 0;
99 virtual SIMDMASK& GetNextVsIndices() = 0;
100 virtual uint32_t NumPrims() = 0;
101 virtual void Reset() = 0;
102 virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
103 };
104
105 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
106 // output. Here is the sequence
107 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
108 // 2. Execute PA function to assemble and bin triangles.
109 // a. The PA function is a set of functions that collectively make up the
110 // state machine for a given topology.
111 // 1. We use a state index to track which PA function to call.
112 // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
113 // 1. We call this the current and previous simd vertex.
114 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
115 // order to assemble the second triangle, for a triangle list, we'll need the
116 // last vertex from the previous simd and the first 2 vertices from the current simd.
117 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
118 //
119 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
120 // cuts
121 struct PA_STATE_OPT : public PA_STATE
122 {
123 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
124 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
125
126 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
127
128 uint32_t cur{ 0 }; // index to current VS output.
129 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
130 const uint32_t first{ 0 }; // index to first VS output. Used for tri fan and line loop.
131
132 uint32_t counter{ 0 }; // state counter
133 bool reset{ false }; // reset state
134
135 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
136 SIMDSCALARI primID;
137
// Function-pointer types for the per-topology state machine; the concrete
// functions are installed by the constructor / SetNextState.
138 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
139 #if ENABLE_AVX512_SIMD16
140 typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
141 #endif
142 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
143
144 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
145 #if ENABLE_AVX512_SIMD16
146 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
147 #endif
148 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
149 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
150 #if ENABLE_AVX512_SIMD16
151 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
152 #endif
153
154 // state used to advance the PA when Next is called
155 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
156 #if ENABLE_AVX512_SIMD16
157 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
158 #endif
159 uint32_t nextNumSimdPrims{ 0 };
160 uint32_t nextNumPrimsIncrement{ 0 };
161 bool nextReset{ false };
162 bool isStreaming{ false };
163
164 SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function
165
166 PA_STATE_OPT() {}
167 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
168 uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
169
// True while completed prims are still owed for this draw.
170 bool HasWork()
171 {
172 return (this->numPrimsComplete < this->numPrims) ? true : false;
173 }
174
// Returns the attribute 'slot' of simd vertex 'index' in the vertex stream.
175 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
176 {
177 SWR_ASSERT(slot < vertexStride);
178 uint32_t offset = index * vertexStride + slot;
179 simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
180 return vertexSlot;
181 }
182
183 #if ENABLE_AVX512_SIMD16
184 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
185 {
186 SWR_ASSERT(slot < vertexStride);
187 uint32_t offset = index * vertexStride + slot;
188 simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
189 return vertexSlot;
190 }
191
192 #endif
193 // Assembles 4 triangles. Each simdvector is a single vertex from 4
194 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
195 bool Assemble(uint32_t slot, simdvector verts[])
196 {
197 return this->pfnPaFunc(*this, slot, verts);
198 }
199
200 #if ENABLE_AVX512_SIMD16
201 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
202 {
203 return this->pfnPaFunc_simd16(*this, slot, verts);
204 }
205
206 #endif
207 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
208 void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
209 {
210 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
211 }
212
// Steps the state machine to the state staged by SetNextState.
// Returns true when more prims can be assembled from the current simd
// vertices (numSimdPrims pending) and the draw still has work.
213 bool NextPrim()
214 {
215 this->pfnPaFunc = this->pfnPaNextFunc;
216 #if ENABLE_AVX512_SIMD16
217 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
218 #endif
219 this->numSimdPrims = this->nextNumSimdPrims;
220 this->numPrimsComplete += this->nextNumPrimsIncrement;
221 this->reset = this->nextReset;
222
223 if (this->isStreaming)
224 {
225 this->reset = false;
226 }
227
228 bool morePrims = false;
229
230 if (this->numSimdPrims > 0)
231 {
232 morePrims = true;
233 this->numSimdPrims--;
234 }
235 else
236 {
237 this->counter = (this->reset) ? 0 : (this->counter + 1);
238 this->reset = false;
239 }
240
241 if (!HasWork())
242 {
243 morePrims = false; // no more to do
244 }
245
246 return morePrims;
247 }
248
// Returns the simd vertex slot the VS should write next, advancing
// cur/prev. Once the buffer is exhausted the last two slots are recycled.
249 SIMDVERTEX& GetNextVsOutput()
250 {
251 const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
252
253 // increment cur and prev indices
254 if (counter < numSimdVerts)
255 {
256 // prev undefined for first state
257 prev = cur;
258 cur = counter;
259 }
260 else
261 {
262 // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
263 uint32_t temp = prev;
264
265 prev = cur;
266 cur = temp;
267 }
268
269 SWR_ASSERT(cur < numSimdVerts);
270 SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
271
272 return *(SIMDVERTEX*)pVertex;
273 }
274
275 SIMDMASK& GetNextVsIndices()
276 {
277 // unused in optimized PA, pass tmp buffer back
278 return junkIndices;
279 }
280
281 bool GetNextStreamOutput()
282 {
283 this->prev = this->cur;
284 this->cur = this->counter;
285
286 return HasWork();
287 }
288
// Number of valid prims in the current simd batch; the final batch of a
// draw may be partial.
289 uint32_t NumPrims()
290 {
291 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
292 (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
293 }
294
// Stages the state NextPrim() will switch to.
295 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
296 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
297 uint32_t numSimdPrims = 0,
298 uint32_t numPrimsIncrement = 0,
299 bool reset = false)
300 {
301 this->pfnPaNextFunc = pfnPaNextFunc;
302 this->nextNumSimdPrims = numSimdPrims;
303 this->nextNumPrimsIncrement = numPrimsIncrement;
304 this->nextReset = reset;
305
306 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
307 }
308
309 #if ENABLE_AVX512_SIMD16
310 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
311 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
312 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
313 uint32_t numSimdPrims = 0,
314 uint32_t numPrimsIncrement = 0,
315 bool reset = false)
316 {
317 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
318 this->pfnPaNextFunc = pfnPaNextFunc;
319 this->nextNumSimdPrims = numSimdPrims;
320 this->nextNumPrimsIncrement = numPrimsIncrement;
321 this->nextReset = reset;
322
323 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
324 }
325
326 #endif
// Rewinds the state machine to its initial per-draw state.
327 void Reset()
328 {
329 #if ENABLE_AVX512_SIMD16
330 useAlternateOffset = false;
331
332 #endif
333 this->pfnPaFunc = this->pfnPaFuncReset;
334 #if ENABLE_AVX512_SIMD16
335 this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
336 #endif
337 this->numPrimsComplete = 0;
338 this->numSimdPrims = 0;
339 this->cur = 0;
340 this->prev = 0;
341 this->counter = 0;
342 this->reset = false;
343 }
344
// Per-lane primitive IDs for the current batch, offset by startID.
345 SIMDSCALARI GetPrimID(uint32_t startID)
346 {
347 #if USE_SIMD16_FRONTEND
348 return _simd16_add_epi32(this->primID,
349 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
350 #else
351 return _simd_add_epi32(this->primID,
352 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
353 #endif
354 }
355 };
356
357 // helper C wrappers to avoid having to rewrite all the PA topology state functions
358 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
359 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
360 uint32_t numSimdPrims = 0,
361 uint32_t numPrimsIncrement = 0,
362 bool reset = false)
363 {
364 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
365 }
366
367 #if ENABLE_AVX512_SIMD16
368 INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
369 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
370 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
371 uint32_t numSimdPrims = 0,
372 uint32_t numPrimsIncrement = 0,
373 bool reset = false)
374 {
375 return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
376 }
377
378 #endif
379 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
380 {
381 return pa.GetSimdVector(index, slot);
382 }
383
384 #if ENABLE_AVX512_SIMD16
385 INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
386 {
387 return pa.GetSimdVector_simd16(index, slot);
388 }
389
390 #endif
391 // Cut-aware primitive assembler.
392 struct PA_STATE_CUT : public PA_STATE
393 {
394 SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
395 uint32_t numVerts{ 0 }; // number of vertices available in buffer store
396 uint32_t numAttribs{ 0 }; // number of attributes
397 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
398 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
399 #if ENABLE_AVX512_SIMD16
400 OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
401 #else
402 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
403 #endif
404 SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
405 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
406 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
407 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
408 uint32_t curVertex{ 0 }; // current unprocessed vertex
409 uint32_t startPrimId{ 0 }; // starting prim id
410 SIMDSCALARI vPrimId; // vector of prim ID
411 bool needOffsets{ false }; // need to compute gather offsets for current SIMD
412 uint32_t vertsPerPrim{ 0 };
413 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
414 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
415 // while the GS sends valid verts for every index
416
417 simdvector junkVector; // junk simdvector for unimplemented API
418 #if ENABLE_AVX512_SIMD16
419 simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
420 #endif
421
422 // Topology state tracking
423 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
424 uint32_t curIndex{ 0 };
425 bool reverseWinding{ false }; // indicates reverse winding for strips
426 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
427
428 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
429 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
430
431 PA_STATE_CUT() {}
// Sets up the cut-aware assembler for one draw: captures the vertex ring
// buffer, the per-vertex cut-bit stream, and installs the per-topology
// vertex-processing member function in pfnPa.
432 PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
433 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
434 : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
435 {
436 numVerts = in_streamSizeInVerts;
437 numAttribs = in_numAttribs;
438 binTopology = topo;
439 needOffsets = false;
440 processCutVerts = in_processCutVerts;
441
442 numVertsToAssemble = numRemainingVerts = in_numVerts;
443 numPrimsAssembled = 0;
444 headVertex = tailVertex = curVertex = 0;
445
446 curIndex = 0;
447 pCutIndices = in_pIndices;
448 memset(indices, 0, sizeof(indices));
449 #if USE_SIMD16_FRONTEND
450 vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
451 #else
452 vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
453 #endif
454 reverseWinding = false;
455 adjExtraVert = -1;
456
457 bool gsEnabled = pDC->pState->state.gsState.gsEnable;
458 vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
459
// Adjacency topologies pick a GS/no-GS variant: with no GS only the
// interior primitive is assembled, dropping the adjacency verts.
460 switch (topo)
461 {
462 case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
463 case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
464 case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
465 case TOP_TRI_STRIP_ADJ: if (gsEnabled)
466 {
467 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
468 }
469 else
470 {
471 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
472 }
473 break;
474
475 case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
476 case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
477 case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
478 case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
479 case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
480 default: assert(0 && "Unimplemented topology");
481 }
482 }
483
484 SIMDVERTEX& GetNextVsOutput()
485 {
486 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
487 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
488 this->needOffsets = true;
489 SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
490
491 return *(SIMDVERTEX*)pVertex;
492 }
493
494 SIMDMASK& GetNextVsIndices()
495 {
496 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
497 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
498 return *pCurCutIndex;
499 }
500
501 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
502 {
503 // unused
504 SWR_ASSERT(0 && "Not implemented");
505 return junkVector;
506 }
507
508 #if ENABLE_AVX512_SIMD16
509 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
510 {
511 // unused
512 SWR_ASSERT(0 && "Not implemented");
513 return junkVector_simd16;
514 }
515
516 #endif
517 bool GetNextStreamOutput()
518 {
519 this->headVertex += SIMD_WIDTH;
520 this->needOffsets = true;
521 return HasWork();
522 }
523
// Per-lane primitive IDs for the current batch, offset by the draw's startID.
524 SIMDSCALARI GetPrimID(uint32_t startID)
525 {
526 #if USE_SIMD16_FRONTEND
527 return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
528 #else
529 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
530 #endif
531 }
532
// Rewinds the assembler to its initial per-draw state (ring-buffer
// cursors, topology tracking, and the per-lane prim-ID vector).
533 void Reset()
534 {
535 #if ENABLE_AVX512_SIMD16
536 useAlternateOffset = false;
537
538 #endif
539 this->numRemainingVerts = this->numVertsToAssemble;
540 this->numPrimsAssembled = 0;
541 this->curIndex = 0;
542 this->curVertex = 0;
543 this->tailVertex = 0;
544 this->headVertex = 0;
545 this->reverseWinding = false;
546 this->adjExtraVert = -1;
547 #if USE_SIMD16_FRONTEND
548 this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
549 #else
550 this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
551 #endif
552 }
553
554 bool HasWork()
555 {
556 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
557 }
558
559 bool IsVertexStoreFull()
560 {
561 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
562 }
563
564 void RestartTopology()
565 {
566 this->curIndex = 0;
567 this->reverseWinding = false;
568 this->adjExtraVert = -1;
569 }
570
571 bool IsCutIndex(uint32_t vertex)
572 {
573 uint32_t vertexIndex = vertex / SIMD_WIDTH;
574 uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
575 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
576 }
577
578 // iterates across the unprocessed verts until we hit the end or we
579 // have assembled SIMD prims
580 void ProcessVerts()
581 {
582 while (this->numPrimsAssembled != SIMD_WIDTH &&
583 this->numRemainingVerts > 0 &&
584 this->curVertex != this->headVertex)
585 {
586 // if cut index, restart topology
587 if (IsCutIndex(this->curVertex))
588 {
// processCutVerts: the vert carries real data (GS path) and must
// still be fed to the topology function before the restart.
589 if (this->processCutVerts)
590 {
591 (this->*pfnPa)(this->curVertex, false);
592 }
593 // finish off tri strip w/ adj before restarting topo
594 if (this->adjExtraVert != -1)
595 {
596 (this->*pfnPa)(this->curVertex, true);
597 }
598 RestartTopology();
599 }
600 else
601 {
602 (this->*pfnPa)(this->curVertex, false);
603 }
604
// advance with wrap around the vertex ring buffer
605 this->curVertex++;
606 if (this->curVertex >= this->numVerts) {
607 this->curVertex = 0;
608 }
609 this->numRemainingVerts--;
610 }
611
612 // special case last primitive for tri strip w/ adj
613 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
614 {
615 (this->*pfnPa)(this->curVertex, true);
616 }
617 }
618
// Retires the current batch: frees ring-buffer space up to curVertex and
// bumps the per-lane prim IDs by one simd's worth.
619 void Advance()
620 {
621 // done with current batch
622 // advance tail to the current unsubmitted vertex
623 this->tailVertex = this->curVertex;
624 this->numPrimsAssembled = 0;
625 #if USE_SIMD16_FRONTEND
626 this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
627 #else
628 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
629 #endif
630 }
631
632 bool NextPrim()
633 {
634 // if we've assembled enough prims, we can advance to the next set of verts
635 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
636 {
637 Advance();
638 }
639 return false;
640 }
641
// Converts the gathered vertex indices of the current batch into per-lane
// byte offsets into the vertex stream: batch-base bytes plus the lane's
// float slot within the simd batch.
642 void ComputeOffsets()
643 {
644 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
645 {
646 uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
647 SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
648
649 // step to simdvertex batch
650 const uint32_t simdShift = SIMD_WIDTH_LOG2;
651 #if USE_SIMD16_FRONTEND
652 SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
653 this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
654 #else
655 SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
656 this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
657 #endif
658
659 // step to index
660 const uint32_t simdMask = SIMD_WIDTH - 1;
661 #if USE_SIMD16_FRONTEND
662 SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
663 this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
664 #else
665 SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
666 this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
667 #endif
668 }
669 }
670
// Gathers attribute 'slot' for a full simd batch of assembled prims into
// verts[0..vertsPerPrim-1]. Returns false until a full batch (or the tail
// of the draw) is ready.
671 bool Assemble(uint32_t slot, simdvector *verts)
672 {
673 // process any outstanding verts
674 ProcessVerts();
675
676 // return false if we don't have enough prims assembled
677 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
678 {
679 return false;
680 }
681
682 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
683 if (this->needOffsets)
684 {
685 ComputeOffsets();
686 this->needOffsets = false;
687 }
688
689 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
690 {
691 SIMDSCALARI offsets = this->vOffsets[v];
692
693 // step to attribute
694 #if USE_SIMD16_FRONTEND
695 offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
696 #else
697 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
698 #endif
699
700 float* pBase = (float*)this->pStreamBase;
701 for (uint32_t c = 0; c < 4; ++c)
702 {
703 #if USE_SIMD16_FRONTEND
704 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
705
706 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
// useAlternateOffset selects the upper or lower simd8 half of the
// simd16 gather result.
707 simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
708 verts[v].v[c] = t;
709 #else
710 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
711 #endif
712
713 // move base to next component
714 pBase += SIMD_WIDTH;
715 }
716 }
717
718 return true;
719 }
720
721 #if ENABLE_AVX512_SIMD16
722 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
723 {
724 // process any outstanding verts
725 ProcessVerts();
726
727 // return false if we don't have enough prims assembled
728 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
729 {
730 return false;
731 }
732
733 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
734 if (this->needOffsets)
735 {
736 ComputeOffsets();
737 this->needOffsets = false;
738 }
739
740 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
741 {
742 SIMDSCALARI offsets = this->vOffsets[v];
743
744 // step to attribute
745 #if USE_SIMD16_FRONTEND
746 offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
747 #else
748 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
749 #endif
750
751 float* pBase = (float*)this->pStreamBase;
752 for (uint32_t c = 0; c < 4; ++c)
753 {
754 #if USE_SIMD16_FRONTEND
755 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
756 #else
757 verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
758 #endif
759
760 // move base to next component
761 pBase += SIMD_WIDTH;
762 }
763 }
764
765 return true;
766 }
767
768 #endif
// Extracts one assembled primitive (lane 'triIndex' of the current batch)
// for attribute 'slot' as scalar xyzw verts.
769 void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
770 {
771 // move to slot
772 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
773 {
774 uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
775 #if USE_SIMD16_FRONTEND
// useAlternateOffset selects the upper simd8 half of the simd16 batch
776 uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
777 #else
778 uint32_t offset = pOffset[triIndex];
779 #endif
780 offset += sizeof(SIMDVECTOR) * slot;
781 float* pVert = (float*)&tri[v];
782 for (uint32_t c = 0; c < 4; ++c)
783 {
784 float* pComponent = (float*)(this->pStreamBase + offset);
785 pVert[c] = *pComponent;
// components of one attribute are SIMD_WIDTH floats apart in the stream
786 offset += SIMD_WIDTH * sizeof(float);
787 }
788 }
789 }
790
791 uint32_t NumPrims()
792 {
793 return this->numPrimsAssembled;
794 }
795
796 // Per-topology functions
797 void ProcessVertTriStrip(uint32_t index, bool finish)
798 {
799 this->vert[this->curIndex] = index;
800 this->curIndex++;
801 if (this->curIndex == 3)
802 {
803 // assembled enough verts for prim, add to gather indices
804 this->indices[0][this->numPrimsAssembled] = this->vert[0];
805 if (reverseWinding)
806 {
807 this->indices[1][this->numPrimsAssembled] = this->vert[2];
808 this->indices[2][this->numPrimsAssembled] = this->vert[1];
809 }
810 else
811 {
812 this->indices[1][this->numPrimsAssembled] = this->vert[1];
813 this->indices[2][this->numPrimsAssembled] = this->vert[2];
814 }
815
816 // increment numPrimsAssembled
817 this->numPrimsAssembled++;
818
819 // set up next prim state
820 this->vert[0] = this->vert[1];
821 this->vert[1] = this->vert[2];
822 this->curIndex = 2;
823 this->reverseWinding ^= 1;
824 }
825 }
826
// Emits one tri-strip-with-adjacency primitive from the vert[] window.
// With a GS all 6 verts are forwarded; without one only the interior
// triangle (vert 0/1/2 after compaction) is emitted and the window is
// restored for the next primitive.
827 template<bool gsEnabled>
828 void AssembleTriStripAdj()
829 {
830 if (!gsEnabled)
831 {
// compact the interior triangle into slots 0/1/2 for emission
832 this->vert[1] = this->vert[2];
833 this->vert[2] = this->vert[4];
834
835 this->indices[0][this->numPrimsAssembled] = this->vert[0];
836 this->indices[1][this->numPrimsAssembled] = this->vert[1];
837 this->indices[2][this->numPrimsAssembled] = this->vert[2];
838
// undo the compaction so the strip window stays consistent
839 this->vert[4] = this->vert[2];
840 this->vert[2] = this->vert[1];
841 }
842 else
843 {
844 this->indices[0][this->numPrimsAssembled] = this->vert[0];
845 this->indices[1][this->numPrimsAssembled] = this->vert[1];
846 this->indices[2][this->numPrimsAssembled] = this->vert[2];
847 this->indices[3][this->numPrimsAssembled] = this->vert[3];
848 this->indices[4][this->numPrimsAssembled] = this->vert[4];
849 this->indices[5][this->numPrimsAssembled] = this->vert[5];
850 }
851 this->numPrimsAssembled++;
852 }
853
854
855 template<bool gsEnabled>
856 void ProcessVertTriStripAdj(uint32_t index, bool finish)
857 {
858 // handle last primitive of tristrip
859 if (finish && this->adjExtraVert != -1)
860 {
861 this->vert[3] = this->adjExtraVert;
862 AssembleTriStripAdj<gsEnabled>();
863 this->adjExtraVert = -1;
864 return;
865 }
866
867 switch (this->curIndex)
868 {
869 case 0:
870 case 1:
871 case 2:
872 case 4:
873 this->vert[this->curIndex] = index;
874 this->curIndex++;
875 break;
876 case 3:
877 this->vert[5] = index;
878 this->curIndex++;
879 break;
880 case 5:
881 if (this->adjExtraVert == -1)
882 {
883 this->adjExtraVert = index;
884 }
885 else
886 {
887 this->vert[3] = index;
888 if (!gsEnabled)
889 {
890 AssembleTriStripAdj<gsEnabled>();
891
892 uint32_t nextTri[6];
893 if (this->reverseWinding)
894 {
895 nextTri[0] = this->vert[4];
896 nextTri[1] = this->vert[0];
897 nextTri[2] = this->vert[2];
898 nextTri[4] = this->vert[3];
899 nextTri[5] = this->adjExtraVert;
900 }
901 else
902 {
903 nextTri[0] = this->vert[2];
904 nextTri[1] = this->adjExtraVert;
905 nextTri[2] = this->vert[3];
906 nextTri[4] = this->vert[4];
907 nextTri[5] = this->vert[0];
908 }
909 for (uint32_t i = 0; i < 6; ++i)
910 {
911 this->vert[i] = nextTri[i];
912 }
913
914 this->adjExtraVert = -1;
915 this->reverseWinding ^= 1;
916 }
917 else
918 {
919 this->curIndex++;
920 }
921 }
922 break;
923 case 6:
924 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
925 AssembleTriStripAdj<gsEnabled>();
926
927 uint32_t nextTri[6];
928 if (this->reverseWinding)
929 {
930 nextTri[0] = this->vert[4];
931 nextTri[1] = this->vert[0];
932 nextTri[2] = this->vert[2];
933 nextTri[4] = this->vert[3];
934 nextTri[5] = this->adjExtraVert;
935 }
936 else
937 {
938 nextTri[0] = this->vert[2];
939 nextTri[1] = this->adjExtraVert;
940 nextTri[2] = this->vert[3];
941 nextTri[4] = this->vert[4];
942 nextTri[5] = this->vert[0];
943 }
944 for (uint32_t i = 0; i < 6; ++i)
945 {
946 this->vert[i] = nextTri[i];
947 }
948 this->reverseWinding ^= 1;
949 this->adjExtraVert = index;
950 this->curIndex--;
951 break;
952 }
953 }
954
955 void ProcessVertTriList(uint32_t index, bool finish)
956 {
957 this->vert[this->curIndex] = index;
958 this->curIndex++;
959 if (this->curIndex == 3)
960 {
961 // assembled enough verts for prim, add to gather indices
962 this->indices[0][this->numPrimsAssembled] = this->vert[0];
963 this->indices[1][this->numPrimsAssembled] = this->vert[1];
964 this->indices[2][this->numPrimsAssembled] = this->vert[2];
965
966 // increment numPrimsAssembled
967 this->numPrimsAssembled++;
968
969 // set up next prim state
970 this->curIndex = 0;
971 }
972 }
973
974 void ProcessVertTriListAdj(uint32_t index, bool finish)
975 {
976 this->vert[this->curIndex] = index;
977 this->curIndex++;
978 if (this->curIndex == 6)
979 {
980 // assembled enough verts for prim, add to gather indices
981 this->indices[0][this->numPrimsAssembled] = this->vert[0];
982 this->indices[1][this->numPrimsAssembled] = this->vert[1];
983 this->indices[2][this->numPrimsAssembled] = this->vert[2];
984 this->indices[3][this->numPrimsAssembled] = this->vert[3];
985 this->indices[4][this->numPrimsAssembled] = this->vert[4];
986 this->indices[5][this->numPrimsAssembled] = this->vert[5];
987
988 // increment numPrimsAssembled
989 this->numPrimsAssembled++;
990
991 // set up next prim state
992 this->curIndex = 0;
993 }
994 }
995
996 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
997 {
998 this->vert[this->curIndex] = index;
999 this->curIndex++;
1000 if (this->curIndex == 6)
1001 {
1002 // assembled enough verts for prim, add to gather indices
1003 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1004 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1005 this->indices[2][this->numPrimsAssembled] = this->vert[4];
1006
1007 // increment numPrimsAssembled
1008 this->numPrimsAssembled++;
1009
1010 // set up next prim state
1011 this->curIndex = 0;
1012 }
1013 }
1014
1015
1016 void ProcessVertLineList(uint32_t index, bool finish)
1017 {
1018 this->vert[this->curIndex] = index;
1019 this->curIndex++;
1020 if (this->curIndex == 2)
1021 {
1022 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1023 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1024
1025 this->numPrimsAssembled++;
1026 this->curIndex = 0;
1027 }
1028 }
1029
1030 void ProcessVertLineStrip(uint32_t index, bool finish)
1031 {
1032 this->vert[this->curIndex] = index;
1033 this->curIndex++;
1034 if (this->curIndex == 2)
1035 {
1036 // assembled enough verts for prim, add to gather indices
1037 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1038 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1039
1040 // increment numPrimsAssembled
1041 this->numPrimsAssembled++;
1042
1043 // set up next prim state
1044 this->vert[0] = this->vert[1];
1045 this->curIndex = 1;
1046 }
1047 }
1048
1049 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1050 {
1051 this->vert[this->curIndex] = index;
1052 this->curIndex++;
1053 if (this->curIndex == 4)
1054 {
1055 // assembled enough verts for prim, add to gather indices
1056 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1057 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1058 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1059 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1060
1061 // increment numPrimsAssembled
1062 this->numPrimsAssembled++;
1063
1064 // set up next prim state
1065 this->vert[0] = this->vert[1];
1066 this->vert[1] = this->vert[2];
1067 this->vert[2] = this->vert[3];
1068 this->curIndex = 3;
1069 }
1070 }
1071
1072 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1073 {
1074 this->vert[this->curIndex] = index;
1075 this->curIndex++;
1076 if (this->curIndex == 4)
1077 {
1078 // assembled enough verts for prim, add to gather indices
1079 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1080 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1081
1082 // increment numPrimsAssembled
1083 this->numPrimsAssembled++;
1084
1085 // set up next prim state
1086 this->vert[0] = this->vert[1];
1087 this->vert[1] = this->vert[2];
1088 this->vert[2] = this->vert[3];
1089 this->curIndex = 3;
1090 }
1091 }
1092
1093 void ProcessVertLineListAdj(uint32_t index, bool finish)
1094 {
1095 this->vert[this->curIndex] = index;
1096 this->curIndex++;
1097 if (this->curIndex == 4)
1098 {
1099 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1100 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1101 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1102 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1103
1104 this->numPrimsAssembled++;
1105 this->curIndex = 0;
1106 }
1107 }
1108
1109 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1110 {
1111 this->vert[this->curIndex] = index;
1112 this->curIndex++;
1113 if (this->curIndex == 4)
1114 {
1115 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1116 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1117
1118 this->numPrimsAssembled++;
1119 this->curIndex = 0;
1120 }
1121 }
1122
1123 void ProcessVertPointList(uint32_t index, bool finish)
1124 {
1125 this->vert[this->curIndex] = index;
1126 this->curIndex++;
1127 if (this->curIndex == 1)
1128 {
1129 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1130 this->numPrimsAssembled++;
1131 this->curIndex = 0;
1132 }
1133 }
1134 };
1135
1136 // Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    /// @brief Primitive assembler fed by pre-computed DS (domain shader) output.
    /// @param in_pVertData                 SIMD-swizzled vertex attribute data produced upstream.
    /// @param in_attributeStrideInVectors  stride between attributes, in SIMD vectors.
    /// @param in_vertexStride              per-vertex stride forwarded to the PA_STATE base.
    /// @param in_numAttributes             number of attributes per vertex.
    /// @param in_ppIndices                 three per-corner index streams (only the first
    ///                                     m_numVertsPerPrim streams are meaningful).
    /// @param in_numPrims                  total number of prims to assemble.
    /// @param in_binTopology               point/line/triangle list topology of the output.
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // Tessellated output can only be point, line, or triangle lists.
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    /// @return true while prims remain to be assembled.
    bool HasWork()
    {
        return m_numPrims != 0;
    }

    /// Not supported for tessellation PA; asserts and returns junk storage.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    /// Not supported for tessellation PA; asserts and returns junk storage.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    /// @brief Builds a per-lane mask with the low numPrims lanes enabled.
    /// Loading from &maskGen[SIMD_WIDTH - numPrims] yields numPrims leading
    /// -1 (all-bits) lanes followed by zero lanes.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    /// @brief Gathers one attribute slot for all verts of up to SIMD_WIDTH prims.
    /// For each prim corner, the per-prim vertex indices are loaded and used as
    /// gather offsets into the swizzled vertex data; inactive lanes are masked off.
    /// @return false when no prims remain.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            // gather each of the 4 components separately (SoA layout)
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                // caller consumes one simd-wide half of the simd16 gather
                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    /// @brief simd16 variant of Assemble; same gather scheme, full simd16 output.
    /// In the non-simd16-frontend build the simd-wide gather result is placed
    /// into the low half of the simd16 output.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            // gather each of the 4 components separately (SoA layout)
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    /// @brief Scalar assembly of a single prim's verts for one attribute slot.
    /// Reads one float per component per vertex straight from the swizzled data.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            // second simd-wide half of the simd16 index block when requested
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    /// @brief Consumes the current SIMD batch of prims and advances the index streams.
    /// @return true while more prims remain.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    /// Not supported for tessellation PA; asserts and returns junk storage.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    /// Not supported for tessellation PA.
    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    /// Not supported for tessellation PA; asserts and returns junk storage.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    /// @return prim count of the current batch, capped at SIMD_WIDTH.
    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    /// Not supported for tessellation PA.
    void Reset()
    {
        SWR_NOT_IMPL;
    }

    /// @brief Per-lane primitive IDs: startID + m_vPrimId (zeroed in the ctor).
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;          // swizzled DS output attribute data
    uint32_t m_attributeStrideInVectors = 0;            // stride between attributes, in SIMD vectors
    uint32_t m_numAttributes = 0;                       // attributes per vertex
    uint32_t m_numPrims = 0;                            // prims left to assemble
    uint32_t* m_ppIndices[3];                           // per-corner index streams, advanced by NextPrim()

    uint32_t m_numVertsPerPrim = 0;                     // 1/2/3 depending on binTopology

    SIMDSCALARI m_vPrimId;                              // per-lane prim-id offsets (currently zero)

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
};
1413
1414 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1415 // based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    /// @brief Chooses and constructs (via placement new) the appropriate
    /// assembler: the cut-aware PA_STATE_CUT for cut-index indexed draws and
    /// for adjacency topologies, otherwise the optimized PA_STATE_OPT.
    /// @param pVertexStore     backing storage for assembled SIMD vertices.
    /// @param vertexStoreSize  store size in units that are scaled by SIMD_WIDTH below.
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            // construct the cut-aware assembler in place
            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            // construct the optimized assembler in place
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
            cutPA = false;
        }

    }

    /// @return the assembler selected by the constructor, as the PA_STATE base.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    // NOTE(review): only one of paOpt/paCut is placement-new constructed, per
    // the cutPA flag; neither is destroyed here — presumably both are trivially
    // destructible or torn down elsewhere. Verify against PA_STATE_OPT/_CUT.
    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };                                // true when paCut is the active assembler

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };             // topology this factory was built for

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; // cut-index storage handed to PA_STATE_CUT
};