swr: [rasterizer core] Finish SIMD16 PA OPT except tessellation
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
// Abstract base for all primitive assemblers (optimized, cut-aware, etc.).
// Owns the draw context, the input vertex stream, and the working topology,
// and defines the virtual interface the binner drives.
struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    // SIMD16 front end: 16 lanes per simd vertex/primitive batch.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4           // log2(16), used for index-to-batch shifts
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    // Native (8-wide AVX) front end.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3           // log2(8), used for index-to-batch shifts
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    // When true, AssembleSingle/Assemble read the upper half of a simd16 batch.
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}

    // True while more primitives remain to be assembled for the draw.
    virtual bool HasWork() = 0;
    // Direct access to attribute 'slot' of simd vertex 'index' in the stream.
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    // Assemble a full simd batch of primitives for one attribute slot;
    // returns false if not enough primitives are ready yet.
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    // Assemble one primitive (primIndex within the current batch) as scalar __m128 verts.
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    // Advance the assembler state; returns true if more prims remain in the current batch.
    virtual bool NextPrim() = 0;
    // Returns the next simd vertex slot the VS should write into.
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    // Returns the cut-index mask word for the next simd vertex batch.
    virtual SIMDMASK& GetNextVsIndices() = 0;
    // Number of valid primitives in the current batch (last batch may be partial).
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    // Per-lane primitive IDs biased by the draw's starting prim ID.
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
103
104 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
105 // output. Here is the sequence
106 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
107 // 2. Execute PA function to assemble and bin triangles.
108 // a. The PA function is a set of functions that collectively make up the
109 // state machine for a given topology.
110 // 1. We use a state index to track which PA function to call.
//          b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
112 // 1. We call this the current and previous simd vertex.
113 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
114 // order to assemble the second triangle, for a triangle list, we'll need the
115 // last vertex from the previous simd and the first 2 vertices from the current simd.
116 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
117 //
118 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
119 // cuts
120 struct PA_STATE_OPT : public PA_STATE
121 {
122 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
123 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
124
125 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
126
127 uint32_t cur{ 0 }; // index to current VS output.
128 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
129 uint32_t first{ 0 }; // index to first VS output. Used for trifan.
130
131 uint32_t counter{ 0 }; // state counter
132 bool reset{ false }; // reset state
133
134 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
135 SIMDSCALARI primID;
136
137 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
138 #if ENABLE_AVX512_SIMD16
139 typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
140 #endif
141 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
142
143 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
144 #if ENABLE_AVX512_SIMD16
145 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
146 #endif
147 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
148 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
149 #if ENABLE_AVX512_SIMD16
150 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
151 #endif
152
153 // state used to advance the PA when Next is called
154 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
155 #if ENABLE_AVX512_SIMD16
156 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
157 #endif
158 uint32_t nextNumSimdPrims{ 0 };
159 uint32_t nextNumPrimsIncrement{ 0 };
160 bool nextReset{ false };
161 bool isStreaming{ false };
162
163 SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function
164
165 PA_STATE_OPT() {}
166 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
167 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
168
169 bool HasWork()
170 {
171 return (this->numPrimsComplete < this->numPrims) ? true : false;
172 }
173
174 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
175 {
176 simdvertex* pVertex = (simdvertex*)pStreamBase;
177 return pVertex[index].attrib[slot];
178 }
179
180 #if ENABLE_AVX512_SIMD16
181 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
182 {
183 simd16vertex* pVertex = (simd16vertex*)pStreamBase;
184 return pVertex[index].attrib[slot];
185 }
186
187 #endif
188 // Assembles 4 triangles. Each simdvector is a single vertex from 4
189 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
190 bool Assemble(uint32_t slot, simdvector verts[])
191 {
192 return this->pfnPaFunc(*this, slot, verts);
193 }
194
195 #if ENABLE_AVX512_SIMD16
196 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
197 {
198 return this->pfnPaFunc_simd16(*this, slot, verts);
199 }
200
201 #endif
202 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
203 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
204 {
205 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
206 }
207
208 bool NextPrim()
209 {
210 this->pfnPaFunc = this->pfnPaNextFunc;
211 #if ENABLE_AVX512_SIMD16
212 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
213 #endif
214 this->numSimdPrims = this->nextNumSimdPrims;
215 this->numPrimsComplete += this->nextNumPrimsIncrement;
216 this->reset = this->nextReset;
217
218 if (this->isStreaming)
219 {
220 this->reset = false;
221 }
222
223 bool morePrims = false;
224
225 if (this->numSimdPrims > 0)
226 {
227 morePrims = true;
228 this->numSimdPrims--;
229 }
230 else
231 {
232 this->counter = (this->reset) ? 0 : (this->counter + 1);
233 this->reset = false;
234 }
235
236 this->pfnPaFunc = this->pfnPaNextFunc;
237
238 if (!HasWork())
239 {
240 morePrims = false; // no more to do
241 }
242
243 return morePrims;
244 }
245
246 SIMDVERTEX& GetNextVsOutput()
247 {
248 // increment cur and prev indices
249 const uint32_t numSimdVerts = this->streamSizeInVerts / SIMD_WIDTH;
250 this->prev = this->cur; // prev is undefined for first state.
251 this->cur = this->counter % numSimdVerts;
252
253 SIMDVERTEX* pVertex = (SIMDVERTEX*)pStreamBase;
254 return pVertex[this->cur];
255 }
256
257 SIMDMASK& GetNextVsIndices()
258 {
259 // unused in optimized PA, pass tmp buffer back
260 return tmpIndices;
261 }
262
263 bool GetNextStreamOutput()
264 {
265 this->prev = this->cur;
266 this->cur = this->counter;
267
268 return HasWork();
269 }
270
271 uint32_t NumPrims()
272 {
273 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
274 (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
275 }
276
277 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
278 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
279 uint32_t numSimdPrims = 0,
280 uint32_t numPrimsIncrement = 0,
281 bool reset = false)
282 {
283 this->pfnPaNextFunc = pfnPaNextFunc;
284 this->nextNumSimdPrims = numSimdPrims;
285 this->nextNumPrimsIncrement = numPrimsIncrement;
286 this->nextReset = reset;
287
288 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
289 }
290
291 #if ENABLE_AVX512_SIMD16
292 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
293 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
294 uint32_t numSimdPrims = 0,
295 uint32_t numPrimsIncrement = 0,
296 bool reset = false)
297 {
298 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
299 this->nextNumSimdPrims = numSimdPrims;
300 this->nextNumPrimsIncrement = numPrimsIncrement;
301 this->nextReset = reset;
302
303 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
304 }
305
306 #endif
307 void Reset()
308 {
309 #if ENABLE_AVX512_SIMD16
310 useAlternateOffset = false;
311
312 #endif
313 this->pfnPaFunc = this->pfnPaFuncReset;
314 this->numPrimsComplete = 0;
315 this->numSimdPrims = 0;
316 this->cur = 0;
317 this->prev = 0;
318 this->first = 0;
319 this->counter = 0;
320 this->reset = false;
321 }
322
323 SIMDSCALARI GetPrimID(uint32_t startID)
324 {
325 #if USE_SIMD16_FRONTEND
326 return _simd16_add_epi32(this->primID,
327 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
328 #else
329 return _simd_add_epi32(this->primID,
330 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
331 #endif
332 }
333 };
334
335 // helper C wrappers to avoid having to rewrite all the PA topology state functions
336 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
337 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
338 uint32_t numSimdPrims = 0,
339 uint32_t numPrimsIncrement = 0,
340 bool reset = false)
341 {
342 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
343 }
344
#if ENABLE_AVX512_SIMD16
// SIMD16 counterpart of SetNextPaState.
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
356 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
357 {
358 return pa.GetSimdVector(index, slot);
359 }
360
#if ENABLE_AVX512_SIMD16
// Free-function forwarder to the PA's simd16 vector accessor.
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simd16vector& attrib = pa.GetSimdVector_simd16(index, slot);
    return attrib;
}

#endif
368 // Cut-aware primitive assembler.
369 struct PA_STATE_CUT : public PA_STATE
370 {
371 SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
372 uint32_t numVerts{ 0 }; // number of vertices available in buffer store
373 uint32_t numAttribs{ 0 }; // number of attributes
374 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
375 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
376 #if ENABLE_AVX512_SIMD16
377 OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
378 #else
379 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
380 #endif
381 SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
382 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
383 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
384 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
385 uint32_t curVertex{ 0 }; // current unprocessed vertex
386 uint32_t startPrimId{ 0 }; // starting prim id
387 SIMDSCALARI vPrimId; // vector of prim ID
388 bool needOffsets{ false }; // need to compute gather offsets for current SIMD
389 uint32_t vertsPerPrim{ 0 };
390 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
391 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
392 // while the GS sends valid verts for every index
393 // Topology state tracking
394 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
395 uint32_t curIndex{ 0 };
396 bool reverseWinding{ false }; // indicates reverse winding for strips
397 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
398
399 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
400 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
401
PA_STATE_CUT() {}
// Constructs a cut-aware PA over a ring buffer of verts.
//  in_pStream         - backing vertex store (ring of in_streamSizeInVerts verts)
//  in_pIndices        - cut-index bitfield, 1 bit per vertex
//  in_numVerts        - total verts to assemble for the draw
//  topo               - primitive topology; selects the per-vertex state function
//  in_processCutVerts - if true, verts flagged as cuts are still fed to the
//                       topology function (GS path); otherwise they are dropped
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
    uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
    : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
{
    numVerts = in_streamSizeInVerts;
    numAttribs = in_numAttribs;
    binTopology = topo;
    needOffsets = false;
    processCutVerts = in_processCutVerts;

    numVertsToAssemble = numRemainingVerts = in_numVerts;
    numPrimsAssembled = 0;
    headVertex = tailVertex = curVertex = 0;

    curIndex = 0;
    pCutIndices = in_pIndices;
    memset(indices, 0, sizeof(indices));
    // seed per-lane prim IDs 0..SIMD_WIDTH-1
#if USE_SIMD16_FRONTEND
    vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
    vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    reverseWinding = false;
    adjExtraVert = -1;

    bool gsEnabled = pDC->pState->state.gsState.gsEnable;
    vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

    // pick the per-vertex processing function for the topology; adjacency
    // topologies have separate no-GS variants that discard adjacency verts
    switch (topo)
    {
    case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
    case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
    case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
    case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                {
                                    pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
                                }
                                else
                                {
                                    pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
                                }
                                break;

    case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
    case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
    case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
    case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
    case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
    default: assert(0 && "Unimplemented topology");
    }
}
454
455 SIMDVERTEX& GetNextVsOutput()
456 {
457 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
458 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
459 this->needOffsets = true;
460 return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
461 }
462
463 SIMDMASK& GetNextVsIndices()
464 {
465 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
466 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
467 return *pCurCutIndex;
468 }
469
470 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
471 {
472 // unused
473 SWR_ASSERT(0 && "Not implemented");
474 static simdvector junk;
475 return junk;
476 }
477
#if ENABLE_AVX512_SIMD16
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
    // not used by the cut-aware PA; assert and return a dummy to satisfy
    // the virtual interface
    SWR_ASSERT(0 && "Not implemented");
    static simd16vector dummy;
    return dummy;
}

#endif
bool GetNextStreamOutput()
{
    // advance the head past one simd batch of verts
    // NOTE(review): unlike GetNextVsOutput there is no wrap by numVerts here —
    // presumably the streamout path never wraps the vertex store; confirm with callers
    this->headVertex += SIMD_WIDTH;
    this->needOffsets = true;
    return HasWork();
}
494
// Per-lane prim IDs for the current batch, biased by the draw's starting ID.
SIMDSCALARI GetPrimID(uint32_t startID)
{
#if USE_SIMD16_FRONTEND
    return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
    return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
}
503
// Clears all assembly progress and re-seeds the per-lane prim IDs so the
// draw can be replayed from the beginning.
void Reset()
{
#if ENABLE_AVX512_SIMD16
    useAlternateOffset = false;

#endif
    this->numRemainingVerts = this->numVertsToAssemble;
    this->numPrimsAssembled = 0;
    this->curIndex = 0;
    this->curVertex = 0;
    this->tailVertex = 0;
    this->headVertex = 0;
    this->reverseWinding = false;
    this->adjExtraVert = -1;
    // re-seed lane-local prim IDs 0..SIMD_WIDTH-1
#if USE_SIMD16_FRONTEND
    this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
    this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
}
524
525 bool HasWork()
526 {
527 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
528 }
529
530 bool IsVertexStoreFull()
531 {
532 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
533 }
534
535 void RestartTopology()
536 {
537 this->curIndex = 0;
538 this->reverseWinding = false;
539 this->adjExtraVert = -1;
540 }
541
// Tests the cut bit for a vertex; cut indices are packed 1 bit per vertex,
// SIMD_WIDTH bits per SIMDMASK word.
bool IsCutIndex(uint32_t vertex)
{
    uint32_t vertexIndex = vertex / SIMD_WIDTH;          // which mask word
    uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);   // bit within the word
    return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
}
548
// iterates across the unprocessed verts until we hit the end or we
// have assembled SIMD prims
void ProcessVerts()
{
    while (this->numPrimsAssembled != SIMD_WIDTH &&
        this->numRemainingVerts > 0 &&
        this->curVertex != this->headVertex)
    {
        // if cut index, restart topology
        if (IsCutIndex(this->curVertex))
        {
            if (this->processCutVerts)
            {
                // GS path: cut verts still carry valid data and must be processed
                (this->*pfnPa)(this->curVertex, false);
            }
            // finish off tri strip w/ adj before restarting topo
            if (this->adjExtraVert != -1)
            {
                (this->*pfnPa)(this->curVertex, true);
            }
            RestartTopology();
        }
        else
        {
            (this->*pfnPa)(this->curVertex, false);
        }

        // advance through the ring vertex store, wrapping at numVerts
        this->curVertex++;
        if (this->curVertex >= this->numVerts) {
           this->curVertex = 0;
        }
        this->numRemainingVerts--;
    }

    // special case last primitive for tri strip w/ adj
    if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
    {
        (this->*pfnPa)(this->curVertex, true);
    }
}
589
// Retires the current batch: moves the tail up to the first unsubmitted
// vertex and bumps each lane's prim ID past the batch just submitted.
void Advance()
{
    // done with current batch
    // advance tail to the current unsubmitted vertex
    this->tailVertex = this->curVertex;
    this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
    this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
    this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
}
602
603 bool NextPrim()
604 {
605 // if we've assembled enough prims, we can advance to the next set of verts
606 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
607 {
608 Advance();
609 }
610 return false;
611 }
612
// Converts the gathered vertex indices into per-lane byte offsets into the
// vertex store: offset = (index / SIMD_WIDTH) * sizeof(SIMDVERTEX)
//                      + (index % SIMD_WIDTH) * sizeof(float)
void ComputeOffsets()
{
    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

        // step to simdvertex batch
        const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
        SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
        this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
#else
        SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
        this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
#endif

        // step to index
        const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
        SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
        this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
        SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
        this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
    }
}
640
// Assembles one simd batch of primitives for attribute 'slot' by gathering
// vertex data through the cached byte offsets. Returns false if not enough
// primitives have been assembled yet (and verts remain to process).
bool Assemble(uint32_t slot, simdvector verts[])
{
    // process any outstanding verts
    ProcessVerts();

    // return false if we don't have enough prims assembled
    if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
    {
        return false;
    }

    // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    if (this->needOffsets)
    {
        ComputeOffsets();
        this->needOffsets = false;
    }

    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        SIMDSCALARI offsets = this->vOffsets[v];

        // step to attribute
#if USE_SIMD16_FRONTEND
        offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
        offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

        float* pBase = (float*)this->pStreamBase;
        for (uint32_t c = 0; c < 4; ++c)
        {
#if USE_SIMD16_FRONTEND
            simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

            // caller selects which half of the simd16 gather feeds this simd8 output
            verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
            verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

            // move base to next component
            pBase += SIMD_WIDTH;
        }
    }

    return true;
}
688
#if ENABLE_AVX512_SIMD16
// SIMD16 variant of Assemble: gathers a full simd16-wide set of primitives
// for attribute 'slot'. Returns false if not enough primitives are ready.
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
{
    // process any outstanding verts
    ProcessVerts();

    // return false if we don't have enough prims assembled
    if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
    {
        return false;
    }

    // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    if (this->needOffsets)
    {
        ComputeOffsets();
        this->needOffsets = false;
    }

    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        SIMDSCALARI offsets = this->vOffsets[v];

        // step to attribute
#if USE_SIMD16_FRONTEND
        offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
        // use SIMDVECTOR (rather than spelling out simdvector) for consistency
        // with the simd16 path above; in this configuration SIMDVECTOR is a
        // typedef of simdvector, so the computed stride is unchanged
        offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

        float* pBase = (float*)this->pStreamBase;
        for (uint32_t c = 0; c < 4; ++c)
        {
#if USE_SIMD16_FRONTEND
            verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
            // no simd16 front end: gather fills the low half, high half is zeroed
            verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1);
            verts[v].v[c].hi = _simd_setzero_ps();
#endif

            // move base to next component
            pBase += SIMD_WIDTH;
        }
    }

    return true;
}

#endif
// Assembles one primitive (triIndex within the current batch) with scalar
// loads; tri[] receives one xyzw __m128 per vertex of the primitive.
void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
{
    // move to slot
    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
        // alternate offset selects the upper half of the simd16 offset vector
        uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
        uint32_t offset = pOffset[triIndex];
#endif
        offset += sizeof(SIMDVECTOR) * slot;
        float* pVert = (float*)&tri[v];
        for (uint32_t c = 0; c < 4; ++c)
        {
            float* pComponent = (float*)(this->pStreamBase + offset);
            pVert[c] = *pComponent;
            // components are stored SIMD_WIDTH floats apart (SoA layout)
            offset += SIMD_WIDTH * sizeof(float);
        }
    }
}
759
760 uint32_t NumPrims()
761 {
762 return this->numPrimsAssembled;
763 }
764
765 // Per-topology functions
766 void ProcessVertTriStrip(uint32_t index, bool finish)
767 {
768 this->vert[this->curIndex] = index;
769 this->curIndex++;
770 if (this->curIndex == 3)
771 {
772 // assembled enough verts for prim, add to gather indices
773 this->indices[0][this->numPrimsAssembled] = this->vert[0];
774 if (reverseWinding)
775 {
776 this->indices[1][this->numPrimsAssembled] = this->vert[2];
777 this->indices[2][this->numPrimsAssembled] = this->vert[1];
778 }
779 else
780 {
781 this->indices[1][this->numPrimsAssembled] = this->vert[1];
782 this->indices[2][this->numPrimsAssembled] = this->vert[2];
783 }
784
785 // increment numPrimsAssembled
786 this->numPrimsAssembled++;
787
788 // set up next prim state
789 this->vert[0] = this->vert[1];
790 this->vert[1] = this->vert[2];
791 this->curIndex = 2;
792 this->reverseWinding ^= 1;
793 }
794 }
795
// Emits one tri-strip-with-adjacency primitive from the vert[] window.
// With a GS all six verts are emitted; without one, only the interior
// triangle (verts 0/2/4 collapsed into slots 0/1/2) is emitted.
template<bool gsEnabled>
void AssembleTriStripAdj()
{
    if (!gsEnabled)
    {
        // collapse to the interior triangle, then restore the window so the
        // strip state machine can continue
        this->vert[1] = this->vert[2];
        this->vert[2] = this->vert[4];

        this->indices[0][this->numPrimsAssembled] = this->vert[0];
        this->indices[1][this->numPrimsAssembled] = this->vert[1];
        this->indices[2][this->numPrimsAssembled] = this->vert[2];

        this->vert[4] = this->vert[2];
        this->vert[2] = this->vert[1];
    }
    else
    {
        this->indices[0][this->numPrimsAssembled] = this->vert[0];
        this->indices[1][this->numPrimsAssembled] = this->vert[1];
        this->indices[2][this->numPrimsAssembled] = this->vert[2];
        this->indices[3][this->numPrimsAssembled] = this->vert[3];
        this->indices[4][this->numPrimsAssembled] = this->vert[4];
        this->indices[5][this->numPrimsAssembled] = this->vert[5];
    }
    this->numPrimsAssembled++;
}
822
823
824 template<bool gsEnabled>
825 void ProcessVertTriStripAdj(uint32_t index, bool finish)
826 {
827 // handle last primitive of tristrip
828 if (finish && this->adjExtraVert != -1)
829 {
830 this->vert[3] = this->adjExtraVert;
831 AssembleTriStripAdj<gsEnabled>();
832 this->adjExtraVert = -1;
833 return;
834 }
835
836 switch (this->curIndex)
837 {
838 case 0:
839 case 1:
840 case 2:
841 case 4:
842 this->vert[this->curIndex] = index;
843 this->curIndex++;
844 break;
845 case 3:
846 this->vert[5] = index;
847 this->curIndex++;
848 break;
849 case 5:
850 if (this->adjExtraVert == -1)
851 {
852 this->adjExtraVert = index;
853 }
854 else
855 {
856 this->vert[3] = index;
857 if (!gsEnabled)
858 {
859 AssembleTriStripAdj<gsEnabled>();
860
861 uint32_t nextTri[6];
862 if (this->reverseWinding)
863 {
864 nextTri[0] = this->vert[4];
865 nextTri[1] = this->vert[0];
866 nextTri[2] = this->vert[2];
867 nextTri[4] = this->vert[3];
868 nextTri[5] = this->adjExtraVert;
869 }
870 else
871 {
872 nextTri[0] = this->vert[2];
873 nextTri[1] = this->adjExtraVert;
874 nextTri[2] = this->vert[3];
875 nextTri[4] = this->vert[4];
876 nextTri[5] = this->vert[0];
877 }
878 for (uint32_t i = 0; i < 6; ++i)
879 {
880 this->vert[i] = nextTri[i];
881 }
882
883 this->adjExtraVert = -1;
884 this->reverseWinding ^= 1;
885 }
886 else
887 {
888 this->curIndex++;
889 }
890 }
891 break;
892 case 6:
893 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
894 AssembleTriStripAdj<gsEnabled>();
895
896 uint32_t nextTri[6];
897 if (this->reverseWinding)
898 {
899 nextTri[0] = this->vert[4];
900 nextTri[1] = this->vert[0];
901 nextTri[2] = this->vert[2];
902 nextTri[4] = this->vert[3];
903 nextTri[5] = this->adjExtraVert;
904 }
905 else
906 {
907 nextTri[0] = this->vert[2];
908 nextTri[1] = this->adjExtraVert;
909 nextTri[2] = this->vert[3];
910 nextTri[4] = this->vert[4];
911 nextTri[5] = this->vert[0];
912 }
913 for (uint32_t i = 0; i < 6; ++i)
914 {
915 this->vert[i] = nextTri[i];
916 }
917 this->reverseWinding ^= 1;
918 this->adjExtraVert = index;
919 this->curIndex--;
920 break;
921 }
922 }
923
924 void ProcessVertTriList(uint32_t index, bool finish)
925 {
926 this->vert[this->curIndex] = index;
927 this->curIndex++;
928 if (this->curIndex == 3)
929 {
930 // assembled enough verts for prim, add to gather indices
931 this->indices[0][this->numPrimsAssembled] = this->vert[0];
932 this->indices[1][this->numPrimsAssembled] = this->vert[1];
933 this->indices[2][this->numPrimsAssembled] = this->vert[2];
934
935 // increment numPrimsAssembled
936 this->numPrimsAssembled++;
937
938 // set up next prim state
939 this->curIndex = 0;
940 }
941 }
942
943 void ProcessVertTriListAdj(uint32_t index, bool finish)
944 {
945 this->vert[this->curIndex] = index;
946 this->curIndex++;
947 if (this->curIndex == 6)
948 {
949 // assembled enough verts for prim, add to gather indices
950 this->indices[0][this->numPrimsAssembled] = this->vert[0];
951 this->indices[1][this->numPrimsAssembled] = this->vert[1];
952 this->indices[2][this->numPrimsAssembled] = this->vert[2];
953 this->indices[3][this->numPrimsAssembled] = this->vert[3];
954 this->indices[4][this->numPrimsAssembled] = this->vert[4];
955 this->indices[5][this->numPrimsAssembled] = this->vert[5];
956
957 // increment numPrimsAssembled
958 this->numPrimsAssembled++;
959
960 // set up next prim state
961 this->curIndex = 0;
962 }
963 }
964
965 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
966 {
967 this->vert[this->curIndex] = index;
968 this->curIndex++;
969 if (this->curIndex == 6)
970 {
971 // assembled enough verts for prim, add to gather indices
972 this->indices[0][this->numPrimsAssembled] = this->vert[0];
973 this->indices[1][this->numPrimsAssembled] = this->vert[2];
974 this->indices[2][this->numPrimsAssembled] = this->vert[4];
975
976 // increment numPrimsAssembled
977 this->numPrimsAssembled++;
978
979 // set up next prim state
980 this->curIndex = 0;
981 }
982 }
983
984
985 void ProcessVertLineList(uint32_t index, bool finish)
986 {
987 this->vert[this->curIndex] = index;
988 this->curIndex++;
989 if (this->curIndex == 2)
990 {
991 this->indices[0][this->numPrimsAssembled] = this->vert[0];
992 this->indices[1][this->numPrimsAssembled] = this->vert[1];
993
994 this->numPrimsAssembled++;
995 this->curIndex = 0;
996 }
997 }
998
999 void ProcessVertLineStrip(uint32_t index, bool finish)
1000 {
1001 this->vert[this->curIndex] = index;
1002 this->curIndex++;
1003 if (this->curIndex == 2)
1004 {
1005 // assembled enough verts for prim, add to gather indices
1006 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1007 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1008
1009 // increment numPrimsAssembled
1010 this->numPrimsAssembled++;
1011
1012 // set up next prim state
1013 this->vert[0] = this->vert[1];
1014 this->curIndex = 1;
1015 }
1016 }
1017
1018 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1019 {
1020 this->vert[this->curIndex] = index;
1021 this->curIndex++;
1022 if (this->curIndex == 4)
1023 {
1024 // assembled enough verts for prim, add to gather indices
1025 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1026 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1027 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1028 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1029
1030 // increment numPrimsAssembled
1031 this->numPrimsAssembled++;
1032
1033 // set up next prim state
1034 this->vert[0] = this->vert[1];
1035 this->vert[1] = this->vert[2];
1036 this->vert[2] = this->vert[3];
1037 this->curIndex = 3;
1038 }
1039 }
1040
1041 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1042 {
1043 this->vert[this->curIndex] = index;
1044 this->curIndex++;
1045 if (this->curIndex == 4)
1046 {
1047 // assembled enough verts for prim, add to gather indices
1048 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1049 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1050
1051 // increment numPrimsAssembled
1052 this->numPrimsAssembled++;
1053
1054 // set up next prim state
1055 this->vert[0] = this->vert[1];
1056 this->vert[1] = this->vert[2];
1057 this->vert[2] = this->vert[3];
1058 this->curIndex = 3;
1059 }
1060 }
1061
1062 void ProcessVertLineListAdj(uint32_t index, bool finish)
1063 {
1064 this->vert[this->curIndex] = index;
1065 this->curIndex++;
1066 if (this->curIndex == 4)
1067 {
1068 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1069 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1070 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1071 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1072
1073 this->numPrimsAssembled++;
1074 this->curIndex = 0;
1075 }
1076 }
1077
1078 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1079 {
1080 this->vert[this->curIndex] = index;
1081 this->curIndex++;
1082 if (this->curIndex == 4)
1083 {
1084 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1085 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1086
1087 this->numPrimsAssembled++;
1088 this->curIndex = 0;
1089 }
1090 }
1091
1092 void ProcessVertPointList(uint32_t index, bool finish)
1093 {
1094 this->vert[this->curIndex] = index;
1095 this->curIndex++;
1096 if (this->curIndex == 1)
1097 {
1098 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1099 this->numPrimsAssembled++;
1100 this->curIndex = 0;
1101 }
1102 }
1103 };
1104
// Primitive Assembly for data output from the DomainShader.
// Consumes SOA vertex data plus per-vertex index streams and gathers them
// into SIMD-wide primitives; only point/line/triangle lists are supported.
struct PA_TESS : PA_STATE
{
    // in_pVertData:                SOA vertex data to gather from
    // in_attributeStrideInVectors: stride, in SIMD vectors, between components of an attribute
    // in_numAttributes:            number of attributes in the vertex data
    // in_ppIndices:                one index stream per vertex of the prim (up to 3 used)
    // in_numPrims:                 total number of primitives to assemble
    // in_binTopology:              must be a point, line, or triangle list
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
        // per-lane prim id offsets start at zero; see GetPrimID
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // derive verts-per-prim from the bin topology
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    // True while primitives remain to be assembled.
    bool HasWork()
    {
        return m_numPrims != 0;
    }

    // Not supported for the tessellation PA; asserts and returns a dummy.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simdvector junk;
        return junk;
    }

#if ENABLE_AVX512_SIMD16
    // Not supported for the tessellation PA; asserts and returns a dummy.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simd16vector junk;
        return junk;
    }

#endif
    // Build a per-lane gather mask with the low numPrims lanes active
    // (all-ones) and the rest inactive, by loading a sliding window from a
    // static table of -1s followed by 0s.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        // unaligned load: the window start is not generally vector-aligned
        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        // unaligned load: the window start is not generally vector-aligned
        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    // Gather attribute 'slot' for each vertex of the current batch of prims
    // into verts[]; masked gather zeroes lanes beyond the prim count.
    // Returns false when no prims remain.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);

                // simd-wide caller only consumes half of the simd16 gather
                verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#endif
                // advance to the next component of this attribute
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble; without the SIMD16 frontend only the low
    // half is gathered and the high half is zeroed.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);
#else
                verts[i].v[c].lo = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c].hi = _simd_setzero_ps();
#endif
                // advance to the next component of this attribute
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    // Scalar path: assemble attribute 'slot' of the single prim 'primIndex'
    // into verts[], one __m128 (xyzw) per vertex.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            // when the caller works on the high half, bias into the upper lanes
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    // Consume the current SIMD batch of prims and advance all index streams.
    // Returns true while more prims remain.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    // Not supported for the tessellation PA; asserts and returns a dummy.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDVERTEX junk;
        return junk;
    }

    // Not supported for the tessellation PA; asserts and returns false.
    bool GetNextStreamOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        return false;
    }

    // Not supported for the tessellation PA; asserts and returns a dummy.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDMASK junk;
        return junk;
    }

    // Number of prims in the current SIMD batch (at most SIMD_WIDTH).
    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    // Reset is not supported for the tessellation PA.
    void Reset() { SWR_ASSERT(0); };

    // Per-lane primitive IDs: startID broadcast plus the per-lane offsets
    // held in m_vPrimId.
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;      // SOA vertex data from the DS
    uint32_t m_attributeStrideInVectors = 0;        // stride (SIMD vectors) between attribute components
    uint32_t m_numAttributes = 0;                   // attribute count in the vertex data
    uint32_t m_numPrims = 0;                        // prims remaining to assemble
    uint32_t* m_ppIndices[3];                       // index stream per vertex of the prim

    uint32_t m_numVertsPerPrim = 0;                 // 1, 2, or 3 per bin topology

    SIMDSCALARI m_vPrimId;                          // per-lane prim id offsets (see GetPrimID)
};
1375
// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
// IsIndexedT / IsCutIndexEnabledT are compile-time bool wrappers selecting the
// cut-aware vs. optimized assembler path.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        // Use the cut-aware PA for indexed draws with primitive restart on the
        // basic topologies, ...
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            // construct the cut-aware assembler in-place over the preallocated member
            new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            // ... otherwise use the optimized assembler
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
            cutPA = false;
        }

    }

    // Return the assembler selected at construction time.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;         // optimized assembler (placement-new'd when !cutPA)
    PA_STATE_CUT paCut;         // cut-aware assembler (placement-new'd when cutPA)
    bool cutPA{ false };        // true when paCut is the active assembler

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    // backing storage handed to the constructed assembler
    PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM];
    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};