/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
*        N primitives are assembled at a time, where N is the SIMD width.
*        A state machine, that is specific for a given topology, drives the
*        assembly of vertices into triangles.
*
******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask   SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask   SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}

    virtual bool HasWork() = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//               1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//               1. We call these the current and previous simd vertices.
//               2. The SSE simd is 4-wide, which is not a multiple of the 3 verts needed for
//                  triangles. In order to assemble the second triangle of a triangle list, we
//                  need the last vertex from the previous simd and the first 2 vertices from
//                  the current simd.
//        3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts
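//
// For example (illustrative, assuming the 8-wide AVX path and a TRIANGLE_LIST draw):
// lanes 0-7 of the first simd vertex hold verts 0-7, so the PA can emit triangles
// (0,1,2) and (3,4,5) from the current simd alone, but triangle (6,7,8) also needs
// vert 8 from the next simd vertex -- which is why the state machine keeps both a
// previous and a current simd vertex and advances through per-topology PA functions
// between Assemble() calls.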
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{ 0 };             // Total number of primitives for draw.
    uint32_t numPrimsComplete{ 0 };     // Total number of complete primitives.

    uint32_t numSimdPrims{ 0 };         // Number of prims in current simd.

    uint32_t cur{ 0 };                  // index to current VS output.
    uint32_t prev{ 0 };                 // index to prev VS output. Not really needed in the state.
    const uint32_t first{ 0 };          // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{ 0 };              // state counter
    bool reset{ false };                // reset state

    uint32_t primIDIncr{ 0 };           // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };               // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{ nullptr };          // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
    uint32_t nextNumSimdPrims{ 0 };
    uint32_t nextNumPrimsIncrement{ 0 };
    bool nextReset{ false };
    bool isStreaming{ false };

    SIMDMASK junkIndices{ 0 };          // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork()
    {
        return this->numPrimsComplete < this->numPrims;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles 4 triangles. Each simdvector is a single vertex from 4
    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        return this->pfnPaFunc(*this, slot, verts);
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset = false;
        }

        if (!HasWork())
        {
            morePrims = false;  // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
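        // Example (illustrative): with an 8-wide SIMD and streamSizeInVerts == 16,
        // numSimdVerts == 2; the first two calls hand out slots 0 and 1, after which
        // the two slots are recycled between cur and prev below.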

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
            uint32_t temp = prev;

            prev = cur;
            cur = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
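        // Worked example (illustrative): with SIMD_WIDTH == 8 and numPrims == 13, the
        // final batch sees numPrimsComplete + nextNumPrimsIncrement == 16 > 13, so it
        // returns 8 - (16 - 13) == 5 valid prims rather than a full SIMD's worth.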
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    }

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
        PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims = 0;
        this->cur = 0;
        this->prev = 0;
        this->counter = 0;
        this->reset = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };
    bool processCutVerts{ false };      // vertex indices with cuts should be processed as normal, otherwise they
                                        // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
                                        // while the GS sends valid verts for every index

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{ 0 };
    bool reverseWinding{ false };       // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };          // extra vert used for tristrip w/ adj

    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };       // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
                                    }
                                    else
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
                                    }
                                    break;

        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets = true;
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex = 0;
        this->curVertex = 0;
        this->tailVertex = 0;
        this->headVertex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork()
    {
        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
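        // Example (illustrative, assuming SIMD_WIDTH == 8): vertex 19 maps to cut-index
        // word 19 / 8 == 2 and bit 19 & 7 == 3 within that SIMDMASK word.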
        uint32_t vertexIndex = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
    }

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
            this->numRemainingVerts > 0 &&
            this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts) {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
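        // Offset derivation (illustrative): a vertex index i lives in simd-vertex batch
        // i >> SIMD_WIDTH_LOG2, lane i & (SIMD_WIDTH - 1), so its base byte offset is
        //     (i >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR)
        //         + (i & (SIMD_WIDTH - 1)) * sizeof(float)
        // Assemble() later adds slot * sizeof(SIMDVECTOR) to step to the attribute and
        // walks SIMD_WIDTH floats per component.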
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }

    bool Assemble(uint32_t slot, simdvector *verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }

    uint32_t NumPrims()
    {
        return this->numPrimsAssembled;
    }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
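        // Example (illustrative): verts 0,1,2 emit tri (0,1,2); the next vert 3 then
        // emits tri (1,3,2), since reverseWinding swaps the last two gather indices to
        // keep a consistent facing for the alternating strip triangles.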
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }

    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }


    template<bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }


    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }
};

// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork()
    {
        return m_numPrims != 0;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
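        // Example (illustrative, non-SIMD16 path): numPrims == 3 loads 8 lanes starting
        // at maskGen[8 - 3], yielding { -1, -1, -1, 0, 0, 0, 0, 0 } -- i.e. only the
        // first 3 lanes enabled.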
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    void Reset()
    {
        SWR_NOT_IMPL;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;
    uint32_t m_attributeStrideInVectors = 0;
    uint32_t m_numAttributes = 0;
    uint32_t m_numPrims = 0;
    uint32_t* m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
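
// Typical usage (illustrative sketch only; simplified from the SWR front end, and the
// local names here are hypothetical): construct a PA_FACTORY per draw, fetch the PA it
// selected, then alternate between filling SIMD vertices and assembling primitives.
//
//     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(
//         pDC, topo, numVerts, pVertexStore, vertexStoreSize, vertexStride);
//     PA_STATE& pa = paFactory.GetPA();
//     while (pa.HasWork())
//     {
//         PA_STATE::SIMDVERTEX& vsOut = pa.GetNextVsOutput();
//         // ... run the fetch and vertex shaders, writing their output into vsOut ...
//         do
//         {
//             simdvector prim[MAX_NUM_VERTS_PER_PRIM];
//             if (pa.Assemble(VERTEX_POSITION_SLOT, prim))
//             {
//                 // ... bin/clip the pa.NumPrims() assembled primitives ...
//             }
//         } while (pa.NextPrim());
//     }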