swr/rast: Add support to PA for variable sized vertices
[mesa.git] src/gallium/drivers/swr/rasterizer/core/pa.h
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
*        N primitives are assembled at a time, where N is the SIMD width.
*        A state machine, that is specific for a given topology, drives the
*        assembly of vertices into triangles.
*
******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    enum
    {
        SIMD_WIDTH = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}

    virtual bool HasWork() = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
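
// Illustrative usage sketch (not part of the original header): how a frontend
// consumer might drive any PA_STATE-derived assembler. The names 'pa' and
// 'slot' are hypothetical; the real driver loop lives in the frontend and
// differs in detail.
//
//     while (pa.HasWork())
//     {
//         // ... run the VS into pa.GetNextVsOutput(), then:
//         simdvector prims[MAX_NUM_VERTS_PER_PRIM];
//         if (pa.Assemble(slot, prims))
//         {
//             // bin/process pa.NumPrims() assembled primitives
//         }
//         pa.NextPrim();
//     }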

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence:
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//              1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//              1. We call these the current and previous simd vertices.
//              2. The SSE simd is 4-wide, which is not a multiple of the 3 verts needed for
//                 triangles. In order to assemble the second triangle of a triangle list, we
//                 need the last vertex from the previous simd and the first 2 vertices from
//                 the current simd.
//              3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or
// draws without cuts.
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{ 0 };             // Total number of primitives for draw.
    uint32_t numPrimsComplete{ 0 };     // Total number of complete primitives.

    uint32_t numSimdPrims{ 0 };         // Number of prims in current simd.

    uint32_t cur{ 0 };                  // index to current VS output.
    uint32_t prev{ 0 };                 // index to prev VS output. Not really needed in the state.
    const uint32_t first{ 0 };          // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{ 0 };              // state counter
    bool reset{ false };                // reset state

    uint32_t primIDIncr{ 0 };           // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };               // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{ nullptr };          // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
    uint32_t nextNumSimdPrims{ 0 };
    uint32_t nextNumPrimsIncrement{ 0 };
    bool nextReset{ false };
    bool isStreaming{ false };

    SIMDMASK junkIndices{ 0 };          // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork()
    {
        return this->numPrimsComplete < this->numPrims;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles a SIMD batch of triangles. Each simdvector holds one vertex of
    // each triangle in SoA form (xxxx yyyy zzzz wwww), and there are 3 verts per triangle.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        return this->pfnPaFunc(*this, slot, verts);
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset = false;
        }

        if (!HasWork())
        {
            morePrims = false;  // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
            uint32_t temp = prev;

            prev = cur;
            cur = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }
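
    // Note on GetNextVsOutput() above (illustrative, not in the original
    // source): with numSimdVerts = 4, successive calls hand out simd verts
    // 0, 1, 2, 3 in order; once the store is exhausted, cur and prev ping-pong
    // between the last two slots (2 and 3), so strip topologies keep their
    // previous simd vertex live while the rest of the buffer stays intact.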

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    }
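
    // Worked example for NumPrims() above (illustrative, not in the original
    // source): with SIMD_WIDTH = 8, numPrims = 10 and nextNumPrimsIncrement = 8,
    // the first batch reports 8 prims; on the last batch numPrimsComplete = 8,
    // so NumPrims() returns 8 - (8 + 8 - 10) = 2, masking off the unused lanes.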

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
        PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims = 0;
        this->cur = 0;
        this->prev = 0;
        this->counter = 0;
        this->reset = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };
    bool processCutVerts{ false };      // If set, vertex indices with cuts are processed as normal;
                                        // otherwise they are ignored. The fetch shader sends invalid
                                        // verts on cuts that should be ignored, while the GS sends
                                        // valid verts for every index.

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{ 0 };
    bool reverseWinding{ false };       // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };          // extra vert used for tristrip w/ adj

    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };       // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
                                    }
                                    else
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
                                    }
                                    break;

        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets = true;
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex = 0;
        this->curVertex = 0;
        this->tailVertex = 0;
        this->headVertex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork()
    {
        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
        uint32_t vertexIndex = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
    }
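
    // Note on IsCutIndex() above (illustrative, not in the original source):
    // pCutIndices packs one cut bit per vertex, SIMD_WIDTH bits per SIMDMASK
    // entry. With SIMD_WIDTH = 8, vertex 13 maps to pCutIndices[1] (13 / 8),
    // bit 5 (13 & 7); a set bit marks that index as a strip cut.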

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
            this->numRemainingVerts > 0 &&
            this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts)
            {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
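
    // Scalar equivalent of ComputeOffsets() above (illustrative, not in the
    // original source): for a vertex index i, the byte offset of its first
    // float component in the SoA vertex store is
    //
    //     offset = (i >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR)
    //            + (i & (SIMD_WIDTH - 1)) * sizeof(float);
    //
    // i.e., step to the owning simdvertex batch, then to the lane within it.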

    // disabling buffer overrun warning for this function for what appears to be a bug in MSVC 2017
    PRAGMA_WARNING_PUSH_DISABLE(4789)
    bool Assemble(uint32_t slot, simdvector *verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }
    PRAGMA_WARNING_POP()

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }

    uint32_t NumPrims()
    {
        return this->numPrimsAssembled;
    }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }
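
    // Walk-through of ProcessVertTriStrip() above (illustrative, not in the
    // original source): for strip vertices 0, 1, 2, 3, 4 it emits tris
    // (0, 1, 2), (1, 3, 2), (2, 3, 4); reverseWinding swaps the last two
    // gather indices on every other tri so all assembled tris face the same way.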

    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }

    template<bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }
};

// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork()
    {
        return m_numPrims != 0;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }
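
    // Worked example for GenPrimMask() above (illustrative, not in the
    // original source): with SIMD_WIDTH = 8 and numPrims = 3, the unaligned
    // load starts at maskGen[5] and yields { -1, -1, -1, 0, 0, 0, 0, 0 }:
    // all-ones lanes for the 3 live prims, zero for the masked-off tail.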

    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    void Reset()
    {
        SWR_NOT_IMPL;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;
    uint32_t m_attributeStrideInVectors = 0;
    uint32_t m_numAttributes = 0;
    uint32_t m_numPrims = 0;
    uint32_t* m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
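
// Illustrative usage sketch (not part of the original header): how a frontend
// might instantiate the factory and drive whichever assembler it selects. The
// local names (vertexStore, STORE_SIZE) are hypothetical.
//
//     PA_STATE::SIMDVERTEX vertexStore[STORE_SIZE];
//     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(
//         pDC, topo, numVerts, vertexStore, STORE_SIZE, vertexStride);
//     PA_STATE& pa = paFactory.GetPA();
//     while (pa.HasWork()) { /* fetch/VS, Assemble, NextPrim, ... */ }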