swr: [rasterizer] Fix Coverity issues reported by Mesa developers.
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
35 struct PA_STATE
36 {
37 DRAW_CONTEXT *pDC{ nullptr }; // draw context
38 uint8_t* pStreamBase{ nullptr }; // vertex stream
39 uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
40
41 // The topology the binner will use. In some cases the FE changes the topology from the api state.
42 PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
43
44 PA_STATE() {}
45 PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
46 pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
47
48 virtual bool HasWork() = 0;
49 virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
50 virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
51 virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
52 virtual bool NextPrim() = 0;
53 virtual simdvertex& GetNextVsOutput() = 0;
54 virtual bool GetNextStreamOutput() = 0;
55 virtual simdmask& GetNextVsIndices() = 0;
56 virtual uint32_t NumPrims() = 0;
57 virtual void Reset() = 0;
58 virtual simdscalari GetPrimID(uint32_t startID) = 0;
59 };
60
61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
62 // output. Here is the sequence
63 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
64 // 2. Execute PA function to assemble and bin triangles.
65 // a. The PA function is a set of functions that collectively make up the
66 // state machine for a given topology.
67 // 1. We use a state index to track which PA function to call.
68 // b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
69 // 1. We call this the current and previous simd vertex.
70 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
71 // order to assemble the second triangle, for a triangle list, we'll need the
72 // last vertex from the previous simd and the first 2 vertices from the current simd.
73 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
74 //
75 // This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
76 // without cuts.
77 struct PA_STATE_OPT : public PA_STATE
78 {
79 simdvertex leadingVertex; // For tri-fan
80 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
81 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
82
83 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
84
85 uint32_t cur{ 0 }; // index to current VS output.
86 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
87 uint32_t first{ 0 }; // index to first VS output. Used for trifan.
88
89 uint32_t counter{ 0 }; // state counter
90 bool reset{ false }; // reset state
91
92 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
93 simdscalari primID;
94
95 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
96 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
97
98 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
99 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
100 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
101
102 // state used to advance the PA when Next is called
103 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
104 uint32_t nextNumSimdPrims{ 0 };
105 uint32_t nextNumPrimsIncrement{ 0 };
106 bool nextReset{ false };
107 bool isStreaming{ false };
108
109 simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function
110
111 PA_STATE_OPT() {}
112 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
113 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
114
115 bool HasWork()
116 {
117 return (this->numPrimsComplete < this->numPrims) ? true : false;
118 }
119
120 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
121 {
122 simdvertex* pVertex = (simdvertex*)pStreamBase;
123 return pVertex[index].attrib[slot];
124 }
125
126 // Assembles 4 triangles. Each simdvector is a single vertex from 4
127 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
128 bool Assemble(uint32_t slot, simdvector verts[])
129 {
130 return this->pfnPaFunc(*this, slot, verts);
131 }
132
133 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
134 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
135 {
136 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
137 }
138
139 bool NextPrim()
140 {
141 this->pfnPaFunc = this->pfnPaNextFunc;
142 this->numSimdPrims = this->nextNumSimdPrims;
143 this->numPrimsComplete += this->nextNumPrimsIncrement;
144 this->reset = this->nextReset;
145
146 if (this->isStreaming)
147 {
148 this->reset = false;
149 }
150
151 bool morePrims = false;
152
153 if (this->numSimdPrims > 0)
154 {
155 morePrims = true;
156 this->numSimdPrims--;
157 }
158 else
159 {
160 this->counter = (this->reset) ? 0 : (this->counter + 1);
161 this->reset = false;
162 }
163
164 this->pfnPaFunc = this->pfnPaNextFunc;
165
166 if (!HasWork())
167 {
168 morePrims = false; // no more to do
169 }
170
171 return morePrims;
172 }
173
174 simdvertex& GetNextVsOutput()
175 {
176 // increment cur and prev indices
177 const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
178 this->prev = this->cur; // prev is undefined for first state.
179 this->cur = this->counter % numSimdVerts;
180
181 simdvertex* pVertex = (simdvertex*)pStreamBase;
182 return pVertex[this->cur];
183 }
184
185 simdmask& GetNextVsIndices()
186 {
187 // unused in optimized PA, pass tmp buffer back
188 return tmpIndices;
189 }
190
191 bool GetNextStreamOutput()
192 {
193 this->prev = this->cur;
194 this->cur = this->counter;
195
196 return HasWork();
197 }
198
199 uint32_t NumPrims()
200 {
201 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
202 (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
203 }
204
205 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
206 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
207 uint32_t numSimdPrims = 0,
208 uint32_t numPrimsIncrement = 0,
209 bool reset = false)
210 {
211 this->pfnPaNextFunc = pfnPaNextFunc;
212 this->nextNumSimdPrims = numSimdPrims;
213 this->nextNumPrimsIncrement = numPrimsIncrement;
214 this->nextReset = reset;
215
216 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
217 }
218
219 void Reset()
220 {
221 this->pfnPaFunc = this->pfnPaFuncReset;
222 this->numPrimsComplete = 0;
223 this->numSimdPrims = 0;
224 this->cur = 0;
225 this->prev = 0;
226 this->first = 0;
227 this->counter = 0;
228 this->reset = false;
229 }
230
231 simdscalari GetPrimID(uint32_t startID)
232 {
233 return _simd_add_epi32(this->primID,
234 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
235 }
236 };
237
238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
240 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
241 uint32_t numSimdPrims = 0,
242 uint32_t numPrimsIncrement = 0,
243 bool reset = false)
244 {
245 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
246 }
247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
248 {
249 return pa.GetSimdVector(index, slot);
250 }
251
252 INLINE __m128 swizzleLane0(const simdvector &a)
253 {
254 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
255 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
256 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
257 }
258
259 INLINE __m128 swizzleLane1(const simdvector &a)
260 {
261 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
262 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
263 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
264 }
265
266 INLINE __m128 swizzleLane2(const simdvector &a)
267 {
268 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
269 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
270 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
271 }
272
273 INLINE __m128 swizzleLane3(const simdvector &a)
274 {
275 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
276 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
277 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
278 }
279
280 INLINE __m128 swizzleLane4(const simdvector &a)
281 {
282 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
283 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
284 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
285
286 }
287
288 INLINE __m128 swizzleLane5(const simdvector &a)
289 {
290 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
291 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
292 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
293 }
294
295 INLINE __m128 swizzleLane6(const simdvector &a)
296 {
297 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
298 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
299 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
300 }
301
302 INLINE __m128 swizzleLane7(const simdvector &a)
303 {
304 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
305 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
306 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
307 }
308
309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
310 {
311 switch (lane) {
312 case 0:
313 return swizzleLane0(a);
314 case 1:
315 return swizzleLane1(a);
316 case 2:
317 return swizzleLane2(a);
318 case 3:
319 return swizzleLane3(a);
320 case 4:
321 return swizzleLane4(a);
322 case 5:
323 return swizzleLane5(a);
324 case 6:
325 return swizzleLane6(a);
326 case 7:
327 return swizzleLane7(a);
328 default:
329 return _mm_setzero_ps();
330 }
331 }
332
333 // Cut-aware primitive assembler.
334 struct PA_STATE_CUT : public PA_STATE
335 {
336 simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
337 uint32_t numVerts{ 0 }; // number of vertices available in buffer store
338 uint32_t numAttribs{ 0 }; // number of attributes
339 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
340 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
341 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
342 simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
343 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
344 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
345 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
346 uint32_t curVertex{ 0 }; // current unprocessed vertex
347 uint32_t startPrimId{ 0 }; // starting prim id
348 simdscalari vPrimId; // vector of prim ID
349 bool needOffsets{ false }; // need to compute gather offsets for current SIMD
350 uint32_t vertsPerPrim{ 0 };
351 simdvertex tmpVertex; // temporary simdvertex for unimplemented API
352 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
353 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
354 // while the GS sends valid verts for every index
355 // Topology state tracking
356 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
357 uint32_t curIndex{ 0 };
358 bool reverseWinding{ false }; // indicates reverse winding for strips
359 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
360
361 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
362 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
363
364 PA_STATE_CUT() {}
365 PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
366 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
367 : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
368 {
369 numVerts = in_streamSizeInVerts;
370 numAttribs = in_numAttribs;
371 binTopology = topo;
372 needOffsets = false;
373 processCutVerts = in_processCutVerts;
374
375 numVertsToAssemble = numRemainingVerts = in_numVerts;
376 numPrimsAssembled = 0;
377 headVertex = tailVertex = curVertex = 0;
378
379 curIndex = 0;
380 pCutIndices = in_pIndices;
381 memset(indices, 0, sizeof(indices));
382 vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
383 reverseWinding = false;
384 adjExtraVert = -1;
385
386 bool gsEnabled = pDC->pState->state.gsState.gsEnable;
387 vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
388
389 switch (topo)
390 {
391 case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
392 case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
393 case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
394 case TOP_TRI_STRIP_ADJ: if (gsEnabled)
395 {
396 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
397 }
398 else
399 {
400 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
401 }
402 break;
403
404 case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
405 case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
406 case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
407 case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
408 case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
409 default: assert(0 && "Unimplemented topology");
410 }
411 }
412
413 simdvertex& GetNextVsOutput()
414 {
415 uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
416 this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
417 this->needOffsets = true;
418 return ((simdvertex*)pStreamBase)[vertexIndex];
419 }
420
421 simdmask& GetNextVsIndices()
422 {
423 uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
424 simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
425 return *pCurCutIndex;
426 }
427
428 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
429 {
430 // unused
431 SWR_ASSERT(0 && "Not implemented");
432 return this->tmpVertex.attrib[0];
433 }
434
435 bool GetNextStreamOutput()
436 {
437 this->headVertex += KNOB_SIMD_WIDTH;
438 this->needOffsets = true;
439 return HasWork();
440 }
441
442 simdscalari GetPrimID(uint32_t startID)
443 {
444 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
445 }
446
447 void Reset()
448 {
449 this->numRemainingVerts = this->numVertsToAssemble;
450 this->numPrimsAssembled = 0;
451 this->curIndex = 0;
452 this->curVertex = 0;
453 this->tailVertex = 0;
454 this->headVertex = 0;
455 this->reverseWinding = false;
456 this->adjExtraVert = -1;
457 this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
458 }
459
460 bool HasWork()
461 {
462 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
463 }
464
465 bool IsVertexStoreFull()
466 {
467 return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
468 }
469
470 void RestartTopology()
471 {
472 this->curIndex = 0;
473 this->reverseWinding = false;
474 this->adjExtraVert = -1;
475 }
476
477 bool IsCutIndex(uint32_t vertex)
478 {
479 uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
480 uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
481 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
482 }
483
484 // iterates across the unprocessed verts until we hit the end or we
485 // have assembled SIMD prims
486 void ProcessVerts()
487 {
488 while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
489 this->numRemainingVerts > 0 &&
490 this->curVertex != this->headVertex)
491 {
492 // if cut index, restart topology
493 if (IsCutIndex(this->curVertex))
494 {
495 if (this->processCutVerts)
496 {
497 (this->*pfnPa)(this->curVertex, false);
498 }
499 // finish off tri strip w/ adj before restarting topo
500 if (this->adjExtraVert != -1)
501 {
502 (this->*pfnPa)(this->curVertex, true);
503 }
504 RestartTopology();
505 }
506 else
507 {
508 (this->*pfnPa)(this->curVertex, false);
509 }
510
511 this->curVertex = (this->curVertex + 1) % this->numVerts;
512 this->numRemainingVerts--;
513 }
514
515 // special case last primitive for tri strip w/ adj
516 if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
517 {
518 (this->*pfnPa)(this->curVertex, true);
519 }
520 }
521
522 void Advance()
523 {
524 // done with current batch
525 // advance tail to the current unsubmitted vertex
526 this->tailVertex = this->curVertex;
527 this->numPrimsAssembled = 0;
528 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
529 }
530
531 bool NextPrim()
532 {
533 // if we've assembled enough prims, we can advance to the next set of verts
534 if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
535 {
536 Advance();
537 }
538 return false;
539 }
540
541 void ComputeOffsets()
542 {
543 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
544 {
545 simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
546
547 // step to simdvertex batch
548 const uint32_t simdShift = 3; // @todo make knob
549 simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
550 this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
551
552 // step to index
553 const uint32_t simdMask = 0x7; // @todo make knob
554 simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
555 this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
556 }
557 }
558
559 bool Assemble(uint32_t slot, simdvector result[])
560 {
561 // process any outstanding verts
562 ProcessVerts();
563
564 // return false if we don't have enough prims assembled
565 if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
566 {
567 return false;
568 }
569
570 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
571 if (this->needOffsets)
572 {
573 ComputeOffsets();
574 this->needOffsets = false;
575 }
576
577 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
578 {
579 simdscalari offsets = this->vOffsets[v];
580
581 // step to attribute
582 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
583
584 float* pBase = (float*)this->pStreamBase;
585 for (uint32_t c = 0; c < 4; ++c)
586 {
587 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
588
589 // move base to next component
590 pBase += KNOB_SIMD_WIDTH;
591 }
592 }
593
594 return true;
595 }
596
597 void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
598 {
599 // move to slot
600 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
601 {
602 uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
603 uint32_t offset = pOffset[triIndex];
604 offset += sizeof(simdvector) * slot;
605 float* pVert = (float*)&tri[v];
606 for (uint32_t c = 0; c < 4; ++c)
607 {
608 float* pComponent = (float*)(this->pStreamBase + offset);
609 pVert[c] = *pComponent;
610 offset += KNOB_SIMD_WIDTH * sizeof(float);
611 }
612 }
613 }
614
615 uint32_t NumPrims()
616 {
617 return this->numPrimsAssembled;
618 }
619
620 // Per-topology functions
621 void ProcessVertTriStrip(uint32_t index, bool finish)
622 {
623 this->vert[this->curIndex] = index;
624 this->curIndex++;
625 if (this->curIndex == 3)
626 {
627 // assembled enough verts for prim, add to gather indices
628 this->indices[0][this->numPrimsAssembled] = this->vert[0];
629 if (reverseWinding)
630 {
631 this->indices[1][this->numPrimsAssembled] = this->vert[2];
632 this->indices[2][this->numPrimsAssembled] = this->vert[1];
633 }
634 else
635 {
636 this->indices[1][this->numPrimsAssembled] = this->vert[1];
637 this->indices[2][this->numPrimsAssembled] = this->vert[2];
638 }
639
640 // increment numPrimsAssembled
641 this->numPrimsAssembled++;
642
643 // set up next prim state
644 this->vert[0] = this->vert[1];
645 this->vert[1] = this->vert[2];
646 this->curIndex = 2;
647 this->reverseWinding ^= 1;
648 }
649 }
650
651 template<bool gsEnabled>
652 void AssembleTriStripAdj()
653 {
654 if (!gsEnabled)
655 {
656 this->vert[1] = this->vert[2];
657 this->vert[2] = this->vert[4];
658
659 this->indices[0][this->numPrimsAssembled] = this->vert[0];
660 this->indices[1][this->numPrimsAssembled] = this->vert[1];
661 this->indices[2][this->numPrimsAssembled] = this->vert[2];
662
663 this->vert[4] = this->vert[2];
664 this->vert[2] = this->vert[1];
665 }
666 else
667 {
668 this->indices[0][this->numPrimsAssembled] = this->vert[0];
669 this->indices[1][this->numPrimsAssembled] = this->vert[1];
670 this->indices[2][this->numPrimsAssembled] = this->vert[2];
671 this->indices[3][this->numPrimsAssembled] = this->vert[3];
672 this->indices[4][this->numPrimsAssembled] = this->vert[4];
673 this->indices[5][this->numPrimsAssembled] = this->vert[5];
674 }
675 this->numPrimsAssembled++;
676 }
677
678
679 template<bool gsEnabled>
680 void ProcessVertTriStripAdj(uint32_t index, bool finish)
681 {
682 // handle last primitive of tristrip
683 if (finish && this->adjExtraVert != -1)
684 {
685 this->vert[3] = this->adjExtraVert;
686 AssembleTriStripAdj<gsEnabled>();
687 this->adjExtraVert = -1;
688 return;
689 }
690
691 switch (this->curIndex)
692 {
693 case 0:
694 case 1:
695 case 2:
696 case 4:
697 this->vert[this->curIndex] = index;
698 this->curIndex++;
699 break;
700 case 3:
701 this->vert[5] = index;
702 this->curIndex++;
703 break;
704 case 5:
705 if (this->adjExtraVert == -1)
706 {
707 this->adjExtraVert = index;
708 }
709 else
710 {
711 this->vert[3] = index;
712 if (!gsEnabled)
713 {
714 AssembleTriStripAdj<gsEnabled>();
715
716 uint32_t nextTri[6];
717 if (this->reverseWinding)
718 {
719 nextTri[0] = this->vert[4];
720 nextTri[1] = this->vert[0];
721 nextTri[2] = this->vert[2];
722 nextTri[4] = this->vert[3];
723 nextTri[5] = this->adjExtraVert;
724 }
725 else
726 {
727 nextTri[0] = this->vert[2];
728 nextTri[1] = this->adjExtraVert;
729 nextTri[2] = this->vert[3];
730 nextTri[4] = this->vert[4];
731 nextTri[5] = this->vert[0];
732 }
733 for (uint32_t i = 0; i < 6; ++i)
734 {
735 this->vert[i] = nextTri[i];
736 }
737
738 this->adjExtraVert = -1;
739 this->reverseWinding ^= 1;
740 }
741 else
742 {
743 this->curIndex++;
744 }
745 }
746 break;
747 case 6:
748 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
749 AssembleTriStripAdj<gsEnabled>();
750
751 uint32_t nextTri[6];
752 if (this->reverseWinding)
753 {
754 nextTri[0] = this->vert[4];
755 nextTri[1] = this->vert[0];
756 nextTri[2] = this->vert[2];
757 nextTri[4] = this->vert[3];
758 nextTri[5] = this->adjExtraVert;
759 }
760 else
761 {
762 nextTri[0] = this->vert[2];
763 nextTri[1] = this->adjExtraVert;
764 nextTri[2] = this->vert[3];
765 nextTri[4] = this->vert[4];
766 nextTri[5] = this->vert[0];
767 }
768 for (uint32_t i = 0; i < 6; ++i)
769 {
770 this->vert[i] = nextTri[i];
771 }
772 this->reverseWinding ^= 1;
773 this->adjExtraVert = index;
774 this->curIndex--;
775 break;
776 }
777 }
778
779 void ProcessVertTriList(uint32_t index, bool finish)
780 {
781 this->vert[this->curIndex] = index;
782 this->curIndex++;
783 if (this->curIndex == 3)
784 {
785 // assembled enough verts for prim, add to gather indices
786 this->indices[0][this->numPrimsAssembled] = this->vert[0];
787 this->indices[1][this->numPrimsAssembled] = this->vert[1];
788 this->indices[2][this->numPrimsAssembled] = this->vert[2];
789
790 // increment numPrimsAssembled
791 this->numPrimsAssembled++;
792
793 // set up next prim state
794 this->curIndex = 0;
795 }
796 }
797
798 void ProcessVertTriListAdj(uint32_t index, bool finish)
799 {
800 this->vert[this->curIndex] = index;
801 this->curIndex++;
802 if (this->curIndex == 6)
803 {
804 // assembled enough verts for prim, add to gather indices
805 this->indices[0][this->numPrimsAssembled] = this->vert[0];
806 this->indices[1][this->numPrimsAssembled] = this->vert[1];
807 this->indices[2][this->numPrimsAssembled] = this->vert[2];
808 this->indices[3][this->numPrimsAssembled] = this->vert[3];
809 this->indices[4][this->numPrimsAssembled] = this->vert[4];
810 this->indices[5][this->numPrimsAssembled] = this->vert[5];
811
812 // increment numPrimsAssembled
813 this->numPrimsAssembled++;
814
815 // set up next prim state
816 this->curIndex = 0;
817 }
818 }
819
820 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
821 {
822 this->vert[this->curIndex] = index;
823 this->curIndex++;
824 if (this->curIndex == 6)
825 {
826 // assembled enough verts for prim, add to gather indices
827 this->indices[0][this->numPrimsAssembled] = this->vert[0];
828 this->indices[1][this->numPrimsAssembled] = this->vert[2];
829 this->indices[2][this->numPrimsAssembled] = this->vert[4];
830
831 // increment numPrimsAssembled
832 this->numPrimsAssembled++;
833
834 // set up next prim state
835 this->curIndex = 0;
836 }
837 }
838
839
840 void ProcessVertLineList(uint32_t index, bool finish)
841 {
842 this->vert[this->curIndex] = index;
843 this->curIndex++;
844 if (this->curIndex == 2)
845 {
846 this->indices[0][this->numPrimsAssembled] = this->vert[0];
847 this->indices[1][this->numPrimsAssembled] = this->vert[1];
848
849 this->numPrimsAssembled++;
850 this->curIndex = 0;
851 }
852 }
853
854 void ProcessVertLineStrip(uint32_t index, bool finish)
855 {
856 this->vert[this->curIndex] = index;
857 this->curIndex++;
858 if (this->curIndex == 2)
859 {
860 // assembled enough verts for prim, add to gather indices
861 this->indices[0][this->numPrimsAssembled] = this->vert[0];
862 this->indices[1][this->numPrimsAssembled] = this->vert[1];
863
864 // increment numPrimsAssembled
865 this->numPrimsAssembled++;
866
867 // set up next prim state
868 this->vert[0] = this->vert[1];
869 this->curIndex = 1;
870 }
871 }
872
873 void ProcessVertLineStripAdj(uint32_t index, bool finish)
874 {
875 this->vert[this->curIndex] = index;
876 this->curIndex++;
877 if (this->curIndex == 4)
878 {
879 // assembled enough verts for prim, add to gather indices
880 this->indices[0][this->numPrimsAssembled] = this->vert[0];
881 this->indices[1][this->numPrimsAssembled] = this->vert[1];
882 this->indices[2][this->numPrimsAssembled] = this->vert[2];
883 this->indices[3][this->numPrimsAssembled] = this->vert[3];
884
885 // increment numPrimsAssembled
886 this->numPrimsAssembled++;
887
888 // set up next prim state
889 this->vert[0] = this->vert[1];
890 this->vert[1] = this->vert[2];
891 this->vert[2] = this->vert[3];
892 this->curIndex = 3;
893 }
894 }
895
896 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
897 {
898 this->vert[this->curIndex] = index;
899 this->curIndex++;
900 if (this->curIndex == 4)
901 {
902 // assembled enough verts for prim, add to gather indices
903 this->indices[0][this->numPrimsAssembled] = this->vert[1];
904 this->indices[1][this->numPrimsAssembled] = this->vert[2];
905
906 // increment numPrimsAssembled
907 this->numPrimsAssembled++;
908
909 // set up next prim state
910 this->vert[0] = this->vert[1];
911 this->vert[1] = this->vert[2];
912 this->vert[2] = this->vert[3];
913 this->curIndex = 3;
914 }
915 }
916
917 void ProcessVertLineListAdj(uint32_t index, bool finish)
918 {
919 this->vert[this->curIndex] = index;
920 this->curIndex++;
921 if (this->curIndex == 4)
922 {
923 this->indices[0][this->numPrimsAssembled] = this->vert[0];
924 this->indices[1][this->numPrimsAssembled] = this->vert[1];
925 this->indices[2][this->numPrimsAssembled] = this->vert[2];
926 this->indices[3][this->numPrimsAssembled] = this->vert[3];
927
928 this->numPrimsAssembled++;
929 this->curIndex = 0;
930 }
931 }
932
933 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
934 {
935 this->vert[this->curIndex] = index;
936 this->curIndex++;
937 if (this->curIndex == 4)
938 {
939 this->indices[0][this->numPrimsAssembled] = this->vert[1];
940 this->indices[1][this->numPrimsAssembled] = this->vert[2];
941
942 this->numPrimsAssembled++;
943 this->curIndex = 0;
944 }
945 }
946
947 void ProcessVertPointList(uint32_t index, bool finish)
948 {
949 this->vert[this->curIndex] = index;
950 this->curIndex++;
951 if (this->curIndex == 1)
952 {
953 this->indices[0][this->numPrimsAssembled] = this->vert[0];
954 this->numPrimsAssembled++;
955 this->curIndex = 0;
956 }
957 }
958 };
959
960 // Primitive Assembly for data output from the DomainShader.
961 struct PA_TESS : PA_STATE
962 {
963 PA_TESS(
964 DRAW_CONTEXT *in_pDC,
965 const simdscalar* in_pVertData,
966 uint32_t in_attributeStrideInVectors,
967 uint32_t in_numAttributes,
968 uint32_t* (&in_ppIndices)[3],
969 uint32_t in_numPrims,
970 PRIMITIVE_TOPOLOGY in_binTopology) :
971
972 PA_STATE(in_pDC, nullptr, 0),
973 m_pVertexData(in_pVertData),
974 m_attributeStrideInVectors(in_attributeStrideInVectors),
975 m_numAttributes(in_numAttributes),
976 m_numPrims(in_numPrims)
977 {
978 m_vPrimId = _simd_setzero_si();
979 binTopology = in_binTopology;
980 m_ppIndices[0] = in_ppIndices[0];
981 m_ppIndices[1] = in_ppIndices[1];
982 m_ppIndices[2] = in_ppIndices[2];
983
984 switch (binTopology)
985 {
986 case TOP_POINT_LIST:
987 m_numVertsPerPrim = 1;
988 break;
989
990 case TOP_LINE_LIST:
991 m_numVertsPerPrim = 2;
992 break;
993
994 case TOP_TRIANGLE_LIST:
995 m_numVertsPerPrim = 3;
996 break;
997
998 default:
999 SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1000 break;
1001 }
1002 }
1003
1004 bool HasWork()
1005 {
1006 return m_numPrims != 0;
1007 }
1008
1009 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1010 {
1011 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
1012 static simdvector junk = { 0 };
1013 return junk;
1014 }
1015
1016 static simdscalari GenPrimMask(uint32_t numPrims)
1017 {
1018 SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
1019 #if KNOB_SIMD_WIDTH == 8
1020 static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
1021 {
1022 -1, -1, -1, -1, -1, -1, -1, -1,
1023 0, 0, 0, 0, 0, 0, 0, 0
1024 };
1025 #elif KNOB_SIMD_WIDTH == 16
1026 static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
1027 {
1028 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1029 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1030 };
1031 #else
1032 #error "Help, help, I can't get up!"
1033 #endif
1034
1035 return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
1036 }
1037
1038 bool Assemble(uint32_t slot, simdvector verts[])
1039 {
1040 static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
1041 SWR_ASSERT(slot < m_numAttributes);
1042
1043 uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1044 if (0 == numPrimsToAssemble)
1045 {
1046 return false;
1047 }
1048
1049 simdscalari mask = GenPrimMask(numPrimsToAssemble);
1050
1051 const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1052 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1053 {
1054 simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
1055
1056 const float* pBase = pBaseAttrib;
1057 for (uint32_t c = 0; c < 4; ++c)
1058 {
1059 verts[i].v[c] = _simd_mask_i32gather_ps(
1060 _simd_setzero_ps(),
1061 pBase,
1062 indices,
1063 _simd_castsi_ps(mask),
1064 4 /* gcc doesn't like sizeof(float) */);
1065 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1066 }
1067 }
1068
1069 return true;
1070 }
1071
1072 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
1073 {
1074 SWR_ASSERT(slot < m_numAttributes);
1075 SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1076
1077 const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1078 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1079 {
1080 uint32_t index = m_ppIndices[i][primIndex];
1081 const float* pVertData = pVertDataBase;
1082 float* pVert = (float*)&verts[i];
1083
1084 for (uint32_t c = 0; c < 4; ++c)
1085 {
1086 pVert[c] = pVertData[index];
1087 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1088 }
1089 }
1090 }
1091
1092 bool NextPrim()
1093 {
1094 uint32_t numPrims = PA_TESS::NumPrims();
1095 m_numPrims -= numPrims;
1096 m_ppIndices[0] += numPrims;
1097 m_ppIndices[1] += numPrims;
1098 m_ppIndices[2] += numPrims;
1099
1100 return HasWork();
1101 }
1102
1103 simdvertex& GetNextVsOutput()
1104 {
1105 SWR_ASSERT(0, "%s", __FUNCTION__);
1106 static simdvertex junk;
1107 return junk;
1108 }
1109
1110 bool GetNextStreamOutput()
1111 {
1112 SWR_ASSERT(0, "%s", __FUNCTION__);
1113 return false;
1114 }
1115
1116 simdmask& GetNextVsIndices()
1117 {
1118 SWR_ASSERT(0, "%s", __FUNCTION__);
1119 static simdmask junk;
1120 return junk;
1121 }
1122
1123 uint32_t NumPrims()
1124 {
1125 return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
1126 }
1127
1128 void Reset() { SWR_ASSERT(0); };
1129
1130 simdscalari GetPrimID(uint32_t startID)
1131 {
1132 return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1133 }
1134
1135 private:
1136 const simdscalar* m_pVertexData = nullptr;
1137 uint32_t m_attributeStrideInVectors = 0;
1138 uint32_t m_numAttributes = 0;
1139 uint32_t m_numPrims = 0;
1140 uint32_t* m_ppIndices[3];
1141
1142 uint32_t m_numVertsPerPrim = 0;
1143
1144 simdscalari m_vPrimId;
1145 };
1146
1147 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1148 // based on state.
1149 template <bool IsIndexedT>
1150 struct PA_FACTORY
1151 {
1152 PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
1153 {
1154 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1155 const API_STATE& state = GetApiState(pDC);
1156 if ((IsIndexedT && (
1157 topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1158 topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1159 topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ ||
1160 topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
1161 topo == TOP_TRI_STRIP_ADJ)) ||
1162
1163 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1164 // for them in the optimized PA
1165 (!IsIndexedT && (
1166 topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)))
1167 {
1168 memset(&indexStore, 0, sizeof(indexStore));
1169 DWORD numAttribs;
1170 _BitScanReverse(&numAttribs, state.feAttribMask);
1171 numAttribs++;
1172 new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
1173 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
1174 cutPA = true;
1175 }
1176 else
1177 #endif
1178 {
1179 uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1180 new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
1181 cutPA = false;
1182 }
1183
1184 }
1185
1186 PA_STATE& GetPA()
1187 {
1188 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1189 if (cutPA)
1190 {
1191 return this->paCut;
1192 }
1193 else
1194 #endif
1195 {
1196 return this->paOpt;
1197 }
1198 }
1199
1200 PA_STATE_OPT paOpt;
1201 PA_STATE_CUT paCut;
1202 bool cutPA{ false };
1203
1204 PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1205
1206 simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
1207 simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
1208 };