swr: don't claim to allow setting layer/viewport from VS
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
35 struct PA_STATE
36 {
37 DRAW_CONTEXT *pDC{ nullptr }; // draw context
38 uint8_t* pStreamBase{ nullptr }; // vertex stream
39 uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
40
41 // The topology the binner will use. In some cases the FE changes the topology from the api state.
42 PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
43
44 PA_STATE() {}
45 PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
46 pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
47
48 virtual bool HasWork() = 0;
49 virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
50 virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
51 virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
52 virtual bool NextPrim() = 0;
53 virtual simdvertex& GetNextVsOutput() = 0;
54 virtual bool GetNextStreamOutput() = 0;
55 virtual simdmask& GetNextVsIndices() = 0;
56 virtual uint32_t NumPrims() = 0;
57 virtual void Reset() = 0;
58 virtual simdscalari GetPrimID(uint32_t startID) = 0;
59 };
60
61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
62 // output. Here is the sequence
63 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
64 // 2. Execute PA function to assemble and bin triangles.
65 // a. The PA function is a set of functions that collectively make up the
66 // state machine for a given topology.
67 // 1. We use a state index to track which PA function to call.
68 // b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
69 // 1. We call this the current and previous simd vertex.
70 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
71 // order to assemble the second triangle, for a triangle list, we'll need the
72 // last vertex from the previous simd and the first 2 vertices from the current simd.
73 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
74 //
75 // This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
76 // without cuts.
77 struct PA_STATE_OPT : public PA_STATE
78 {
79 simdvertex leadingVertex; // For tri-fan
80 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
81 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
82
83 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
84
85 uint32_t cur{ 0 }; // index to current VS output.
86 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
87 uint32_t first{ 0 }; // index to first VS output. Used for trifan.
88
89 uint32_t counter{ 0 }; // state counter
90 bool reset{ false }; // reset state
91
92 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
93 simdscalari primID;
94
95 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
96 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
97
98 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
99 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
100 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
101
102 // state used to advance the PA when Next is called
103 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
104 uint32_t nextNumSimdPrims{ 0 };
105 uint32_t nextNumPrimsIncrement{ 0 };
106 bool nextReset{ false };
107 bool isStreaming{ false };
108
109 simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function
110
111 PA_STATE_OPT() {}
112 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
113 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
114
115 bool HasWork()
116 {
117 return (this->numPrimsComplete < this->numPrims) ? true : false;
118 }
119
120 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
121 {
122 simdvertex* pVertex = (simdvertex*)pStreamBase;
123 return pVertex[index].attrib[slot];
124 }
125
126 // Assembles 4 triangles. Each simdvector is a single vertex from 4
127 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
128 bool Assemble(uint32_t slot, simdvector verts[])
129 {
130 return this->pfnPaFunc(*this, slot, verts);
131 }
132
133 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
134 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
135 {
136 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
137 }
138
139 bool NextPrim()
140 {
141 this->pfnPaFunc = this->pfnPaNextFunc;
142 this->numSimdPrims = this->nextNumSimdPrims;
143 this->numPrimsComplete += this->nextNumPrimsIncrement;
144 this->reset = this->nextReset;
145
146 if (this->isStreaming)
147 {
148 this->reset = false;
149 }
150
151 bool morePrims = false;
152
153 if (this->numSimdPrims > 0)
154 {
155 morePrims = true;
156 this->numSimdPrims--;
157 }
158 else
159 {
160 this->counter = (this->reset) ? 0 : (this->counter + 1);
161 this->reset = false;
162 }
163
164 this->pfnPaFunc = this->pfnPaNextFunc;
165
166 if (!HasWork())
167 {
168 morePrims = false; // no more to do
169 }
170
171 return morePrims;
172 }
173
174 simdvertex& GetNextVsOutput()
175 {
176 // increment cur and prev indices
177 const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
178 this->prev = this->cur; // prev is undefined for first state.
179 this->cur = this->counter % numSimdVerts;
180
181 simdvertex* pVertex = (simdvertex*)pStreamBase;
182 return pVertex[this->cur];
183 }
184
185 simdmask& GetNextVsIndices()
186 {
187 // unused in optimized PA, pass tmp buffer back
188 return tmpIndices;
189 }
190
191 bool GetNextStreamOutput()
192 {
193 this->prev = this->cur;
194 this->cur = this->counter;
195
196 return HasWork();
197 }
198
199 uint32_t NumPrims()
200 {
201 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
202 (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
203 }
204
205 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
206 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
207 uint32_t numSimdPrims = 0,
208 uint32_t numPrimsIncrement = 0,
209 bool reset = false)
210 {
211 this->pfnPaNextFunc = pfnPaNextFunc;
212 this->nextNumSimdPrims = numSimdPrims;
213 this->nextNumPrimsIncrement = numPrimsIncrement;
214 this->nextReset = reset;
215
216 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
217 }
218
219 void Reset()
220 {
221 this->pfnPaFunc = this->pfnPaFuncReset;
222 this->numPrimsComplete = 0;
223 this->numSimdPrims = 0;
224 this->cur = 0;
225 this->prev = 0;
226 this->first = 0;
227 this->counter = 0;
228 this->reset = false;
229 }
230
231 simdscalari GetPrimID(uint32_t startID)
232 {
233 return _simd_add_epi32(this->primID,
234 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
235 }
236 };
237
238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
240 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
241 uint32_t numSimdPrims = 0,
242 uint32_t numPrimsIncrement = 0,
243 bool reset = false)
244 {
245 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
246 }
247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
248 {
249 return pa.GetSimdVector(index, slot);
250 }
251
252 INLINE __m128 swizzleLane0(const simdvector &a)
253 {
254 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
255 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
256 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
257 }
258
259 INLINE __m128 swizzleLane1(const simdvector &a)
260 {
261 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
262 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
263 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
264 }
265
266 INLINE __m128 swizzleLane2(const simdvector &a)
267 {
268 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
269 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
270 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
271 }
272
273 INLINE __m128 swizzleLane3(const simdvector &a)
274 {
275 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
276 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
277 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
278 }
279
280 INLINE __m128 swizzleLane4(const simdvector &a)
281 {
282 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
283 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
284 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
285
286 }
287
288 INLINE __m128 swizzleLane5(const simdvector &a)
289 {
290 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
291 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
292 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
293 }
294
295 INLINE __m128 swizzleLane6(const simdvector &a)
296 {
297 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
298 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
299 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
300 }
301
302 INLINE __m128 swizzleLane7(const simdvector &a)
303 {
304 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
305 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
306 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
307 }
308
309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
310 {
311 switch (lane) {
312 case 0:
313 return swizzleLane0(a);
314 case 1:
315 return swizzleLane1(a);
316 case 2:
317 return swizzleLane2(a);
318 case 3:
319 return swizzleLane3(a);
320 case 4:
321 return swizzleLane4(a);
322 case 5:
323 return swizzleLane5(a);
324 case 6:
325 return swizzleLane6(a);
326 case 7:
327 return swizzleLane7(a);
328 default:
329 return _mm_setzero_ps();
330 }
331 }
332
333 // Cut-aware primitive assembler.
334 struct PA_STATE_CUT : public PA_STATE
335 {
336 simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
337 uint32_t numVerts{ 0 }; // number of vertices available in buffer store
338 uint32_t numAttribs{ 0 }; // number of attributes
339 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
340 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
341 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
342 simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
343 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
344 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
345 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
346 uint32_t curVertex{ 0 }; // current unprocessed vertex
347 uint32_t startPrimId{ 0 }; // starting prim id
348 simdscalari vPrimId; // vector of prim ID
349 bool needOffsets{ false }; // need to compute gather offsets for current SIMD
350 uint32_t vertsPerPrim{ 0 };
351 simdvertex tmpVertex; // temporary simdvertex for unimplemented API
352 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
353 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
354 // while the GS sends valid verts for every index
355 // Topology state tracking
356 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
357 uint32_t curIndex{ 0 };
358 bool reverseWinding{ false }; // indicates reverse winding for strips
359 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
360
361 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
362 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
363
364 PA_STATE_CUT() {}
365 PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
366 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
367 : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
368 {
369 numVerts = in_streamSizeInVerts;
370 numAttribs = in_numAttribs;
371 binTopology = topo;
372 needOffsets = false;
373 processCutVerts = in_processCutVerts;
374
375 numVertsToAssemble = numRemainingVerts = in_numVerts;
376 numPrimsAssembled = 0;
377 headVertex = tailVertex = curVertex = 0;
378
379 curIndex = 0;
380 pCutIndices = in_pIndices;
381 memset(indices, 0, sizeof(indices));
382 vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
383 reverseWinding = false;
384 adjExtraVert = -1;
385
386 bool gsEnabled = pDC->pState->state.gsState.gsEnable;
387 vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
388
389 switch (topo)
390 {
391 case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
392 case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
393 case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
394 case TOP_TRI_STRIP_ADJ: if (gsEnabled)
395 {
396 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
397 }
398 else
399 {
400 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
401 }
402 break;
403
404 case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
405 case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
406 case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
407 case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
408 case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
409 default: assert(0 && "Unimplemented topology");
410 }
411 }
412
413 simdvertex& GetNextVsOutput()
414 {
415 uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
416 this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
417 this->needOffsets = true;
418 return ((simdvertex*)pStreamBase)[vertexIndex];
419 }
420
421 simdmask& GetNextVsIndices()
422 {
423 uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
424 simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
425 return *pCurCutIndex;
426 }
427
428 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
429 {
430 // unused
431 SWR_ASSERT(0 && "Not implemented");
432 return this->tmpVertex.attrib[0];
433 }
434
435 bool GetNextStreamOutput()
436 {
437 this->headVertex += KNOB_SIMD_WIDTH;
438 this->needOffsets = true;
439 return HasWork();
440 }
441
442 simdscalari GetPrimID(uint32_t startID)
443 {
444 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
445 }
446
447 void Reset()
448 {
449 this->numRemainingVerts = this->numVertsToAssemble;
450 this->numPrimsAssembled = 0;
451 this->curIndex = 0;
452 this->curVertex = 0;
453 this->tailVertex = 0;
454 this->headVertex = 0;
455 this->reverseWinding = false;
456 this->adjExtraVert = -1;
457 this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
458 }
459
460 bool HasWork()
461 {
462 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
463 }
464
465 bool IsVertexStoreFull()
466 {
467 return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
468 }
469
470 void RestartTopology()
471 {
472 this->curIndex = 0;
473 this->reverseWinding = false;
474 this->adjExtraVert = -1;
475 }
476
477 bool IsCutIndex(uint32_t vertex)
478 {
479 uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
480 uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
481 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
482 }
483
484 // iterates across the unprocessed verts until we hit the end or we
485 // have assembled SIMD prims
486 void ProcessVerts()
487 {
488 while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
489 this->numRemainingVerts > 0 &&
490 this->curVertex != this->headVertex)
491 {
492 // if cut index, restart topology
493 if (IsCutIndex(this->curVertex))
494 {
495 if (this->processCutVerts)
496 {
497 (this->*pfnPa)(this->curVertex, false);
498 }
499 // finish off tri strip w/ adj before restarting topo
500 if (this->adjExtraVert != -1)
501 {
502 (this->*pfnPa)(this->curVertex, true);
503 }
504 RestartTopology();
505 }
506 else
507 {
508 (this->*pfnPa)(this->curVertex, false);
509 }
510
511 this->curVertex++;
512 if (this->curVertex >= this->numVerts) {
513 this->curVertex = 0;
514 }
515 this->numRemainingVerts--;
516 }
517
518 // special case last primitive for tri strip w/ adj
519 if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
520 {
521 (this->*pfnPa)(this->curVertex, true);
522 }
523 }
524
525 void Advance()
526 {
527 // done with current batch
528 // advance tail to the current unsubmitted vertex
529 this->tailVertex = this->curVertex;
530 this->numPrimsAssembled = 0;
531 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
532 }
533
534 bool NextPrim()
535 {
536 // if we've assembled enough prims, we can advance to the next set of verts
537 if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
538 {
539 Advance();
540 }
541 return false;
542 }
543
544 void ComputeOffsets()
545 {
546 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
547 {
548 simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
549
550 // step to simdvertex batch
551 const uint32_t simdShift = 3; // @todo make knob
552 simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
553 this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
554
555 // step to index
556 const uint32_t simdMask = 0x7; // @todo make knob
557 simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
558 this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
559 }
560 }
561
562 bool Assemble(uint32_t slot, simdvector result[])
563 {
564 // process any outstanding verts
565 ProcessVerts();
566
567 // return false if we don't have enough prims assembled
568 if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
569 {
570 return false;
571 }
572
573 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
574 if (this->needOffsets)
575 {
576 ComputeOffsets();
577 this->needOffsets = false;
578 }
579
580 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
581 {
582 simdscalari offsets = this->vOffsets[v];
583
584 // step to attribute
585 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
586
587 float* pBase = (float*)this->pStreamBase;
588 for (uint32_t c = 0; c < 4; ++c)
589 {
590 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
591
592 // move base to next component
593 pBase += KNOB_SIMD_WIDTH;
594 }
595 }
596
597 return true;
598 }
599
600 void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
601 {
602 // move to slot
603 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
604 {
605 uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
606 uint32_t offset = pOffset[triIndex];
607 offset += sizeof(simdvector) * slot;
608 float* pVert = (float*)&tri[v];
609 for (uint32_t c = 0; c < 4; ++c)
610 {
611 float* pComponent = (float*)(this->pStreamBase + offset);
612 pVert[c] = *pComponent;
613 offset += KNOB_SIMD_WIDTH * sizeof(float);
614 }
615 }
616 }
617
618 uint32_t NumPrims()
619 {
620 return this->numPrimsAssembled;
621 }
622
623 // Per-topology functions
624 void ProcessVertTriStrip(uint32_t index, bool finish)
625 {
626 this->vert[this->curIndex] = index;
627 this->curIndex++;
628 if (this->curIndex == 3)
629 {
630 // assembled enough verts for prim, add to gather indices
631 this->indices[0][this->numPrimsAssembled] = this->vert[0];
632 if (reverseWinding)
633 {
634 this->indices[1][this->numPrimsAssembled] = this->vert[2];
635 this->indices[2][this->numPrimsAssembled] = this->vert[1];
636 }
637 else
638 {
639 this->indices[1][this->numPrimsAssembled] = this->vert[1];
640 this->indices[2][this->numPrimsAssembled] = this->vert[2];
641 }
642
643 // increment numPrimsAssembled
644 this->numPrimsAssembled++;
645
646 // set up next prim state
647 this->vert[0] = this->vert[1];
648 this->vert[1] = this->vert[2];
649 this->curIndex = 2;
650 this->reverseWinding ^= 1;
651 }
652 }
653
654 template<bool gsEnabled>
655 void AssembleTriStripAdj()
656 {
657 if (!gsEnabled)
658 {
659 this->vert[1] = this->vert[2];
660 this->vert[2] = this->vert[4];
661
662 this->indices[0][this->numPrimsAssembled] = this->vert[0];
663 this->indices[1][this->numPrimsAssembled] = this->vert[1];
664 this->indices[2][this->numPrimsAssembled] = this->vert[2];
665
666 this->vert[4] = this->vert[2];
667 this->vert[2] = this->vert[1];
668 }
669 else
670 {
671 this->indices[0][this->numPrimsAssembled] = this->vert[0];
672 this->indices[1][this->numPrimsAssembled] = this->vert[1];
673 this->indices[2][this->numPrimsAssembled] = this->vert[2];
674 this->indices[3][this->numPrimsAssembled] = this->vert[3];
675 this->indices[4][this->numPrimsAssembled] = this->vert[4];
676 this->indices[5][this->numPrimsAssembled] = this->vert[5];
677 }
678 this->numPrimsAssembled++;
679 }
680
681
682 template<bool gsEnabled>
683 void ProcessVertTriStripAdj(uint32_t index, bool finish)
684 {
685 // handle last primitive of tristrip
686 if (finish && this->adjExtraVert != -1)
687 {
688 this->vert[3] = this->adjExtraVert;
689 AssembleTriStripAdj<gsEnabled>();
690 this->adjExtraVert = -1;
691 return;
692 }
693
694 switch (this->curIndex)
695 {
696 case 0:
697 case 1:
698 case 2:
699 case 4:
700 this->vert[this->curIndex] = index;
701 this->curIndex++;
702 break;
703 case 3:
704 this->vert[5] = index;
705 this->curIndex++;
706 break;
707 case 5:
708 if (this->adjExtraVert == -1)
709 {
710 this->adjExtraVert = index;
711 }
712 else
713 {
714 this->vert[3] = index;
715 if (!gsEnabled)
716 {
717 AssembleTriStripAdj<gsEnabled>();
718
719 uint32_t nextTri[6];
720 if (this->reverseWinding)
721 {
722 nextTri[0] = this->vert[4];
723 nextTri[1] = this->vert[0];
724 nextTri[2] = this->vert[2];
725 nextTri[4] = this->vert[3];
726 nextTri[5] = this->adjExtraVert;
727 }
728 else
729 {
730 nextTri[0] = this->vert[2];
731 nextTri[1] = this->adjExtraVert;
732 nextTri[2] = this->vert[3];
733 nextTri[4] = this->vert[4];
734 nextTri[5] = this->vert[0];
735 }
736 for (uint32_t i = 0; i < 6; ++i)
737 {
738 this->vert[i] = nextTri[i];
739 }
740
741 this->adjExtraVert = -1;
742 this->reverseWinding ^= 1;
743 }
744 else
745 {
746 this->curIndex++;
747 }
748 }
749 break;
750 case 6:
751 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
752 AssembleTriStripAdj<gsEnabled>();
753
754 uint32_t nextTri[6];
755 if (this->reverseWinding)
756 {
757 nextTri[0] = this->vert[4];
758 nextTri[1] = this->vert[0];
759 nextTri[2] = this->vert[2];
760 nextTri[4] = this->vert[3];
761 nextTri[5] = this->adjExtraVert;
762 }
763 else
764 {
765 nextTri[0] = this->vert[2];
766 nextTri[1] = this->adjExtraVert;
767 nextTri[2] = this->vert[3];
768 nextTri[4] = this->vert[4];
769 nextTri[5] = this->vert[0];
770 }
771 for (uint32_t i = 0; i < 6; ++i)
772 {
773 this->vert[i] = nextTri[i];
774 }
775 this->reverseWinding ^= 1;
776 this->adjExtraVert = index;
777 this->curIndex--;
778 break;
779 }
780 }
781
782 void ProcessVertTriList(uint32_t index, bool finish)
783 {
784 this->vert[this->curIndex] = index;
785 this->curIndex++;
786 if (this->curIndex == 3)
787 {
788 // assembled enough verts for prim, add to gather indices
789 this->indices[0][this->numPrimsAssembled] = this->vert[0];
790 this->indices[1][this->numPrimsAssembled] = this->vert[1];
791 this->indices[2][this->numPrimsAssembled] = this->vert[2];
792
793 // increment numPrimsAssembled
794 this->numPrimsAssembled++;
795
796 // set up next prim state
797 this->curIndex = 0;
798 }
799 }
800
801 void ProcessVertTriListAdj(uint32_t index, bool finish)
802 {
803 this->vert[this->curIndex] = index;
804 this->curIndex++;
805 if (this->curIndex == 6)
806 {
807 // assembled enough verts for prim, add to gather indices
808 this->indices[0][this->numPrimsAssembled] = this->vert[0];
809 this->indices[1][this->numPrimsAssembled] = this->vert[1];
810 this->indices[2][this->numPrimsAssembled] = this->vert[2];
811 this->indices[3][this->numPrimsAssembled] = this->vert[3];
812 this->indices[4][this->numPrimsAssembled] = this->vert[4];
813 this->indices[5][this->numPrimsAssembled] = this->vert[5];
814
815 // increment numPrimsAssembled
816 this->numPrimsAssembled++;
817
818 // set up next prim state
819 this->curIndex = 0;
820 }
821 }
822
823 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
824 {
825 this->vert[this->curIndex] = index;
826 this->curIndex++;
827 if (this->curIndex == 6)
828 {
829 // assembled enough verts for prim, add to gather indices
830 this->indices[0][this->numPrimsAssembled] = this->vert[0];
831 this->indices[1][this->numPrimsAssembled] = this->vert[2];
832 this->indices[2][this->numPrimsAssembled] = this->vert[4];
833
834 // increment numPrimsAssembled
835 this->numPrimsAssembled++;
836
837 // set up next prim state
838 this->curIndex = 0;
839 }
840 }
841
842
843 void ProcessVertLineList(uint32_t index, bool finish)
844 {
845 this->vert[this->curIndex] = index;
846 this->curIndex++;
847 if (this->curIndex == 2)
848 {
849 this->indices[0][this->numPrimsAssembled] = this->vert[0];
850 this->indices[1][this->numPrimsAssembled] = this->vert[1];
851
852 this->numPrimsAssembled++;
853 this->curIndex = 0;
854 }
855 }
856
857 void ProcessVertLineStrip(uint32_t index, bool finish)
858 {
859 this->vert[this->curIndex] = index;
860 this->curIndex++;
861 if (this->curIndex == 2)
862 {
863 // assembled enough verts for prim, add to gather indices
864 this->indices[0][this->numPrimsAssembled] = this->vert[0];
865 this->indices[1][this->numPrimsAssembled] = this->vert[1];
866
867 // increment numPrimsAssembled
868 this->numPrimsAssembled++;
869
870 // set up next prim state
871 this->vert[0] = this->vert[1];
872 this->curIndex = 1;
873 }
874 }
875
876 void ProcessVertLineStripAdj(uint32_t index, bool finish)
877 {
878 this->vert[this->curIndex] = index;
879 this->curIndex++;
880 if (this->curIndex == 4)
881 {
882 // assembled enough verts for prim, add to gather indices
883 this->indices[0][this->numPrimsAssembled] = this->vert[0];
884 this->indices[1][this->numPrimsAssembled] = this->vert[1];
885 this->indices[2][this->numPrimsAssembled] = this->vert[2];
886 this->indices[3][this->numPrimsAssembled] = this->vert[3];
887
888 // increment numPrimsAssembled
889 this->numPrimsAssembled++;
890
891 // set up next prim state
892 this->vert[0] = this->vert[1];
893 this->vert[1] = this->vert[2];
894 this->vert[2] = this->vert[3];
895 this->curIndex = 3;
896 }
897 }
898
899 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
900 {
901 this->vert[this->curIndex] = index;
902 this->curIndex++;
903 if (this->curIndex == 4)
904 {
905 // assembled enough verts for prim, add to gather indices
906 this->indices[0][this->numPrimsAssembled] = this->vert[1];
907 this->indices[1][this->numPrimsAssembled] = this->vert[2];
908
909 // increment numPrimsAssembled
910 this->numPrimsAssembled++;
911
912 // set up next prim state
913 this->vert[0] = this->vert[1];
914 this->vert[1] = this->vert[2];
915 this->vert[2] = this->vert[3];
916 this->curIndex = 3;
917 }
918 }
919
920 void ProcessVertLineListAdj(uint32_t index, bool finish)
921 {
922 this->vert[this->curIndex] = index;
923 this->curIndex++;
924 if (this->curIndex == 4)
925 {
926 this->indices[0][this->numPrimsAssembled] = this->vert[0];
927 this->indices[1][this->numPrimsAssembled] = this->vert[1];
928 this->indices[2][this->numPrimsAssembled] = this->vert[2];
929 this->indices[3][this->numPrimsAssembled] = this->vert[3];
930
931 this->numPrimsAssembled++;
932 this->curIndex = 0;
933 }
934 }
935
936 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
937 {
938 this->vert[this->curIndex] = index;
939 this->curIndex++;
940 if (this->curIndex == 4)
941 {
942 this->indices[0][this->numPrimsAssembled] = this->vert[1];
943 this->indices[1][this->numPrimsAssembled] = this->vert[2];
944
945 this->numPrimsAssembled++;
946 this->curIndex = 0;
947 }
948 }
949
950 void ProcessVertPointList(uint32_t index, bool finish)
951 {
952 this->vert[this->curIndex] = index;
953 this->curIndex++;
954 if (this->curIndex == 1)
955 {
956 this->indices[0][this->numPrimsAssembled] = this->vert[0];
957 this->numPrimsAssembled++;
958 this->curIndex = 0;
959 }
960 }
961 };
962
963 // Primitive Assembly for data output from the DomainShader.
964 struct PA_TESS : PA_STATE
965 {
966 PA_TESS(
967 DRAW_CONTEXT *in_pDC,
968 const simdscalar* in_pVertData,
969 uint32_t in_attributeStrideInVectors,
970 uint32_t in_numAttributes,
971 uint32_t* (&in_ppIndices)[3],
972 uint32_t in_numPrims,
973 PRIMITIVE_TOPOLOGY in_binTopology) :
974
975 PA_STATE(in_pDC, nullptr, 0),
976 m_pVertexData(in_pVertData),
977 m_attributeStrideInVectors(in_attributeStrideInVectors),
978 m_numAttributes(in_numAttributes),
979 m_numPrims(in_numPrims)
980 {
981 m_vPrimId = _simd_setzero_si();
982 binTopology = in_binTopology;
983 m_ppIndices[0] = in_ppIndices[0];
984 m_ppIndices[1] = in_ppIndices[1];
985 m_ppIndices[2] = in_ppIndices[2];
986
987 switch (binTopology)
988 {
989 case TOP_POINT_LIST:
990 m_numVertsPerPrim = 1;
991 break;
992
993 case TOP_LINE_LIST:
994 m_numVertsPerPrim = 2;
995 break;
996
997 case TOP_TRIANGLE_LIST:
998 m_numVertsPerPrim = 3;
999 break;
1000
1001 default:
1002 SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1003 break;
1004 }
1005 }
1006
1007 bool HasWork()
1008 {
1009 return m_numPrims != 0;
1010 }
1011
1012 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1013 {
1014 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
1015 static simdvector junk;
1016 return junk;
1017 }
1018
1019 static simdscalari GenPrimMask(uint32_t numPrims)
1020 {
1021 SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
1022 #if KNOB_SIMD_WIDTH == 8
1023 static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1024 {
1025 -1, -1, -1, -1, -1, -1, -1, -1,
1026 0, 0, 0, 0, 0, 0, 0, 0
1027 };
1028 #elif KNOB_SIMD_WIDTH == 16
1029 static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1030 {
1031 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1032 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1033 };
1034 #else
1035 #error "Help, help, I can't get up!"
1036 #endif
1037
1038 return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
1039 }
1040
1041 bool Assemble(uint32_t slot, simdvector verts[])
1042 {
1043 static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
1044 SWR_ASSERT(slot < m_numAttributes);
1045
1046 uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1047 if (0 == numPrimsToAssemble)
1048 {
1049 return false;
1050 }
1051
1052 simdscalari mask = GenPrimMask(numPrimsToAssemble);
1053
1054 const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1055 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1056 {
1057 simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
1058
1059 const float* pBase = pBaseAttrib;
1060 for (uint32_t c = 0; c < 4; ++c)
1061 {
1062 verts[i].v[c] = _simd_mask_i32gather_ps(
1063 _simd_setzero_ps(),
1064 pBase,
1065 indices,
1066 _simd_castsi_ps(mask),
1067 4 /* gcc doesn't like sizeof(float) */);
1068 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1069 }
1070 }
1071
1072 return true;
1073 }
1074
1075 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
1076 {
1077 SWR_ASSERT(slot < m_numAttributes);
1078 SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1079
1080 const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1081 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1082 {
1083 uint32_t index = m_ppIndices[i][primIndex];
1084 const float* pVertData = pVertDataBase;
1085 float* pVert = (float*)&verts[i];
1086
1087 for (uint32_t c = 0; c < 4; ++c)
1088 {
1089 pVert[c] = pVertData[index];
1090 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1091 }
1092 }
1093 }
1094
1095 bool NextPrim()
1096 {
1097 uint32_t numPrims = PA_TESS::NumPrims();
1098 m_numPrims -= numPrims;
1099 m_ppIndices[0] += numPrims;
1100 m_ppIndices[1] += numPrims;
1101 m_ppIndices[2] += numPrims;
1102
1103 return HasWork();
1104 }
1105
1106 simdvertex& GetNextVsOutput()
1107 {
1108 SWR_ASSERT(0, "%s", __FUNCTION__);
1109 static simdvertex junk;
1110 return junk;
1111 }
1112
1113 bool GetNextStreamOutput()
1114 {
1115 SWR_ASSERT(0, "%s", __FUNCTION__);
1116 return false;
1117 }
1118
1119 simdmask& GetNextVsIndices()
1120 {
1121 SWR_ASSERT(0, "%s", __FUNCTION__);
1122 static simdmask junk;
1123 return junk;
1124 }
1125
1126 uint32_t NumPrims()
1127 {
1128 return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
1129 }
1130
1131 void Reset() { SWR_ASSERT(0); };
1132
1133 simdscalari GetPrimID(uint32_t startID)
1134 {
1135 return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1136 }
1137
1138 private:
1139 const simdscalar* m_pVertexData = nullptr;
1140 uint32_t m_attributeStrideInVectors = 0;
1141 uint32_t m_numAttributes = 0;
1142 uint32_t m_numPrims = 0;
1143 uint32_t* m_ppIndices[3];
1144
1145 uint32_t m_numVertsPerPrim = 0;
1146
1147 simdscalari m_vPrimId;
1148 };
1149
1150 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1151 // based on state.
1152 template <typename IsIndexedT, typename IsCutIndexEnabledT>
1153 struct PA_FACTORY
1154 {
1155 PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
1156 {
1157 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1158 const API_STATE& state = GetApiState(pDC);
1159 if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
1160 topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1161 topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1162 topo == TOP_TRIANGLE_LIST)) ||
1163
1164 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1165 // for them in the optimized PA
1166 (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
1167 {
1168 memset(&indexStore, 0, sizeof(indexStore));
1169 uint32_t numAttribs = state.feNumAttributes;
1170
1171 new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
1172 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
1173 cutPA = true;
1174 }
1175 else
1176 #endif
1177 {
1178 uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1179 new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
1180 cutPA = false;
1181 }
1182
1183 }
1184
1185 PA_STATE& GetPA()
1186 {
1187 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1188 if (cutPA)
1189 {
1190 return this->paCut;
1191 }
1192 else
1193 #endif
1194 {
1195 return this->paOpt;
1196 }
1197 }
1198
1199 PA_STATE_OPT paOpt;
1200 PA_STATE_CUT paCut;
1201 bool cutPA{ false };
1202
1203 PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1204
1205 simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
1206 simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
1207 };