/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
*        N primitives are assembled at a time, where N is the SIMD width.
*        A state machine, that is specific for a given topology, drives the
*        assembly of vertices into triangles.
*
******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask   SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask   SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{ false };
#endif

    bool viewportArrayActive{ false };
    bool rtArrayActive{ false };
    uint32_t numVertsPerPrim{ 0 };

    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {}

    virtual bool HasWork() = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//               1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//               1. We call this the current and previous simd vertex.
//               2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
//                  order to assemble the second triangle, for a triangle list, we'll need the
//                  last vertex from the previous simd and the first 2 vertices from the current simd.
//        3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or draws
// without cuts.
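//
// Illustrative sketch (not compiled, and not part of this header): the rough
// shape of the front-end loop that drives a PA. ExecuteVS and BinPrims are
// hypothetical placeholders; VERTEX_POSITION_SLOT and MAX_NUM_VERTS_PER_PRIM
// come from the surrounding SWR headers.
#if 0
static void DrivePaSketch(PA_STATE& pa)
{
    while (pa.HasWork())
    {
        ExecuteVS(pa.GetNextVsOutput());    // produce the next simd of VS output in-place

        bool morePrims;
        do
        {
            simdvector prim[MAX_NUM_VERTS_PER_PRIM];
            if (pa.Assemble(VERTEX_POSITION_SLOT, prim))    // gather one simd of assembled prims
            {
                BinPrims(pa, prim);         // hand the assembled prims to the binner
            }
            morePrims = pa.NextPrim();      // advance the topology state machine
        } while (morePrims);
    }
}
#endif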
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{ 0 };             // Total number of primitives for draw.
    uint32_t numPrimsComplete{ 0 };     // Total number of complete primitives.

    uint32_t numSimdPrims{ 0 };         // Number of prims in current simd.

    uint32_t cur{ 0 };                  // index to current VS output.
    uint32_t prev{ 0 };                 // index to prev VS output. Not really needed in the state.
    const uint32_t first{ 0 };          // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{ 0 };              // state counter
    bool reset{ false };                // reset state

    uint32_t primIDIncr{ 0 };           // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };               // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{ nullptr };          // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
    uint32_t nextNumSimdPrims{ 0 };
    uint32_t nextNumPrimsIncrement{ 0 };
    bool nextReset{ false };
    bool isStreaming{ false };

    SIMDMASK junkIndices{ 0 };          // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
        uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork()
    {
        return this->numPrimsComplete < this->numPrims;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles a simd's worth of triangles. Each simdvector in verts[] holds one vertex
    // slot of all the triangles in SoA form (xxxx yyyy zzzz wwww); there are 3 vertex
    // slots per triangle.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        return this->pfnPaFunc(*this, slot, verts);
    }
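    // For example (illustrative): after a successful Assemble, lane i of
    // verts[0..2] holds triangle i; verts[v].v[0] is the x components of
    // vertex slot v across all assembled triangles, verts[v].v[1] the y
    // components, and so on.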

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simd4scalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset = false;
        }

        if (!HasWork())
        {
            morePrims = false;    // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
            uint32_t temp = prev;

            prev = cur;
            cur = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    }
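
    // Worked example (illustrative, 8-wide, assuming nextNumPrimsIncrement == SIMD_WIDTH):
    // with numPrims == 10, the first batch reports 8 prims and the second reports
    // 8 - (16 - 10) == 2.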

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                      PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                      uint32_t numSimdPrims = 0,
                      uint32_t numPrimsIncrement = 0,
                      bool reset = false)
    {
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                             PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                             PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                             uint32_t numSimdPrims = 0,
                             uint32_t numPrimsIncrement = 0,
                             bool reset = false)
    {
        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims = 0;
        this->cur = 0;
        this->prev = 0;
        this->counter = 0;
        this->reset = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                           PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                           uint32_t numSimdPrims = 0,
                           uint32_t numPrimsIncrement = 0,
                           bool reset = false)
{
    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                                  PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                                  uint32_t numSimdPrims = 0,
                                  uint32_t numPrimsIncrement = 0,
                                  bool reset = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };
    bool processCutVerts{ false };      // if true, vertex indices with cuts are processed as normal;
                                        // otherwise they are ignored. The fetch shader sends invalid
                                        // verts on cuts that should be ignored, while the GS sends
                                        // valid verts for every index.

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{ 0 };
    bool reverseWinding{ false };       // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };          // extra vert used for tristrip w/ adj

    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };       // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
                 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
                                    }
                                    else
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
                                    }
                                    break;

        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets = true;
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex = 0;
        this->curVertex = 0;
        this->tailVertex = 0;
        this->headVertex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork()
    {
        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
        uint32_t vertexIndex = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
    }
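
    // For example (illustrative, 8-wide): vertex 19 tests bit 19 & 7 == 3 of
    // pCutIndices[19 / 8 == 2].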

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
               this->numRemainingVerts > 0 &&
               this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts)
            {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
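
    // Worked example (illustrative, 8-wide): for vertex index 10, the batch is
    // 10 >> 3 == 1 and the lane is 10 & 7 == 2, giving a byte offset of
    // 1 * vertexStride * sizeof(SIMDVECTOR) + 2 * sizeof(float); Assemble later
    // adds slot * sizeof(SIMDVECTOR) and steps SIMD_WIDTH floats per component.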

    bool Assemble(uint32_t slot, simdvector *verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
                simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
                verts[v].v[c] = t;
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }

    uint32_t NumPrims()
    {
        return this->numPrimsAssembled;
    }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }
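
    // For example (illustrative): a strip v0 v1 v2 v3 emits (v0, v1, v2) and then
    // (v1, v3, v2); the second triangle stores verts 1 and 2 swapped so the
    // winding order of the strip is preserved.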

    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }

    template<bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }
};

// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology,
        uint32_t numVertsPerPrim) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork()
    {
        return m_numPrims != 0;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }
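
    // For example (illustrative, 8-wide): numPrims == 3 loads from &maskGen[5],
    // yielding { -1, -1, -1, 0, 0, 0, 0, 0 }, an unaligned window into the table
    // that enables exactly the first numPrims gather lanes.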

    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    void Reset()
    {
        SWR_NOT_IMPL;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;
    uint32_t m_attributeStrideInVectors = 0;
    uint32_t m_numAttributes = 0;
    uint32_t m_numPrims = 0;
    uint32_t* m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;              // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;               // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
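
// Illustrative sketch (not compiled): constructing the factory and fetching the
// assembler it selected. The template arguments and surrounding variables are
// placeholders for the real front-end state.
#if 0
PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, topo, numVerts, pVertexStore,
                                                     vertexStoreSize, vertexStride, numVertsPerPrim);
PA_STATE& pa = paFactory.GetPA();   // PA_STATE_CUT or PA_STATE_OPT, chosen from draw state
#endif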