/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file frontend.cpp
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/

#include "api.h"
#include "frontend.h"
#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
#include "clip.h"
#include "tilemgr.h"
#include "tessellator.h"
#include <limits>
#include <iostream>

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread
    // account for number of numa nodes
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
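    // (rect.xmax/ymax are exclusive bounds, so subtract 1 to get the inclusive
    // index of the last macro tile touched by the rect)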
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type       = CLEAR;
    work.pfnWork    = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr*     pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type            = STORETILES;
    work.pfnWork         = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t      workerId,
                                   void*         pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
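    // full-tiles-only path: round the min edges up and the max edges down so
    // only macro tiles completely covered by the rect are enqueued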
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type                        = DISCARDINVALIDATETILES;
    work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
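///        e.g., TOP_TRIANGLE_STRIP with 5 vertices yields 3 primitives.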
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims / 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST:
        return numPrims / 4;
    case TOP_QUAD_STRIP:
        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST:
        return numPrims / 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims / 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims / 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ:
        return numPrims / 6;
    case TOP_TRI_STRIP_ADJ:
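        // strip adjacency consumes verts in pairs: e.g., 10 verts -> (10 / 2) - 2 = 3 prims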
        return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims * 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC:
        return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST:
        return numPrims * 4;
    case TOP_QUAD_STRIP:
        return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP:
        return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST:
        return numPrims * 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims * 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims * 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ:
        return numPrims * 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ:
            numVerts = 4;
            break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ:
            numVerts = 6;
            break;
        default:
            break;
        }
    }

    return numVerts;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items being worked on by a SIMD.
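///        e.g., with 3 items remaining and SIMD8, the lane mask is 0b00000111.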
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&           state   = GetApiState(pDC);
    const SWR_STREAMOUT_STATE& soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
    // vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
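    // (e.g., with SWR_VTX_NUM_SLOTS == 32 this is 32 slots * 4 floats = 128 dwords per vertex)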

    SWR_STREAMOUT_CONTEXT soContext = {0};

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD    slot   = 0;
        uint64_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward64(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib =
                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(uint64_t(1) << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                   "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            bool  nullTileAccessed = false;
            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i]      = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
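/// e.g., RoundUpEven(5) == 6, RoundUpEven(6) == 6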
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex*     vertex_simd16,
                                           const simdvertex* vertex,
                                           uint32_t          vertexCount,
                                           uint32_t          attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] =
                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] =
                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items remain. If there is less than a SIMD's worth of work,
///        return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream,
                           uint8_t* pStreamIdBase,
                           uint32_t numEmittedVerts,
                           uint8_t* pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes  = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
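    // Each emitted vertex carries a 2-bit stream ID, so one input byte holds 4
    // vertices and one output cut byte covers 8 vertices (2 input bytes). A set
    // bit marks a vertex that does NOT belong to the requested stream, which
    // the primitive assembler treats as a cut.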

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte      = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}

// Buffers that are allocated if GS is enabled
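// pGsIn: assembled input vertices for the GS. pGsOut: one output stream per
// SIMD lane, each holding every instance's emitted verts. pGsTransposed:
// emitted verts transposed to SIMD form for the primitive assembler.
// pStreamCutBuffer: scratch cut buffer used for multistream output.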
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void*    pStreamCutBuffer;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
///        assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template <typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
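    // Each source vertex is a tightly packed run of numAttribs float4s. The
    // gather offsets below address the same attribute in SimdWidth consecutive
    // source vertices, so one masked gather fills a SIMD register per channel.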

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask          = GenMask(mask);
        auto vMask    = SIMD_T::vmask_ps(mask);
        auto viMask   = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                                           (const float*)(pSrcBase + sizeof(float)),
                                                           vGatherOffsets,
                                                           vMask);
            auto attribGatherZ =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 2),
                                          vGatherOffsets,
                                          vMask);
            auto attribGatherW =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 3),
                                          vGatherOffsets,
                                          vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
template <typename HasStreamOutT, typename HasRastT>
static void GeometryShaderStage(DRAW_CONTEXT* pDC,
                                uint32_t      workerId,
                                PA_STATE&     pa,
                                GsBuffers*    pGsBuffers,
                                uint32_t*     pSoPrimData,
#if USE_SIMD16_FRONTEND
                                uint32_t      numPrims_simd8,
#endif
                                simdscalari const& primID)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&    state  = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT      gsContext;

    static uint8_t sNullBuffer[128] = {0};

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
        uint32_t attribSlot    = pState->vertexAttribOffset + slot;
        pa.Assemble(srcAttribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    }

    // record valid prims from the frontend to avoid over-binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask       = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
        AR_EVENT(GSStats((HANDLE)&gsContext.stats));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles_simd16;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase =
                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                          pVertexBaseAOS,
                                                          vertexCount,
                                                          pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                        pVertexBaseAOS,
                                                        vertexCount,
                                                        pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool     processCutVerts = false;
                uint8_t* pCutBuffer      = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID        = pState->singleStreamID;
                    if (streamID != stream)
                        continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(
                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simd16mask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  pCutBuffer,
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx       = SIMD16::setzero_si();
                                SIMD16::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC,
                                                gsPa,
                                                workerId,
                                                attrib_simd16,
                                                GenMask(gsPa.NumPrims()),
                                                vPrimId,
                                                vViewportIdx,
                                                vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx       = SIMD::setzero_si();
                                SIMD::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC,
                                            gsPa,
                                            workerId,
                                            attrib,
                                            GenMask(gsPa.NumPrims()),
                                            vPrimId,
                                            vViewportIdx,
                                            vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
template <typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
                                     const API_STATE& state,
                                     uint32_t         vertsPerPrim,
                                     GsBuffers*       pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn           = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize =
        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer =
            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    void*          pTxCtx;
    size_t         tsCtxSize;

    uint8_t* pHSOutput;
    size_t   hsOutputAllocSize;

    simdscalar* pDSOutput;
    size_t      dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData =
            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
static void TessellationStages(DRAW_CONTEXT* pDC,
                               uint32_t      workerId,
                               PA_STATE&     pa,
                               GsBuffers*    pGsBuffers,
                               uint32_t*     pSoPrimData,
#if USE_SIMD16_FRONTEND
                               uint32_t      numPrims_simd8,
#endif
                               simdscalari const& primID)
{
    const API_STATE&    state   = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(tsState.domain,
                             tsState.partitioning,
                             tsState.tsOutputTopology,
                             gt_pTessellationThreadData->pTxCtx,
                             gt_pTessellationThreadData->tsCtxSize);
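    // TSInitCtx returns nullptr when the provided buffer is too small and
    // updates tsCtxSize with the required size, so grow the buffer and retry.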
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx =
            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(tsState.domain,
                          tsState.partitioning,
                          tsState.tsOutputTopology,
                          gt_pTessellationThreadData->pTxCtx,
                          gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.PrimitiveID     = primID;
    hsContext.outputSize      = tsState.hsAllocationSize;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
        }
    }

    // Allocate HS output storage
    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;

    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
    {
        AlignedFree(gt_pTessellationThreadData->pHSOutput);
        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
    }

    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;

#if defined(_DEBUG)
    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats((HANDLE)&hsContext.stats));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        ScalarPatch* pCPout =
            (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);

        SWR_TESSELLATION_FACTORS tessFactors;
        tessFactors = hsContext.pCPout[p].tessFactors;

        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = {0};
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
        TSTessellate(tsCtx, tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations =
            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput =
                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID           = pPrimId[p];
        dsContext.pCpIn                 = pCPout;
        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
             ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);

            AR_EVENT(DSStats((HANDLE)&dsContext.stats));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                   // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims    = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi =
                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC,
                    workerId,
                    tessPa,
                    pGsBuffers,
                    pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx       = SIMD16::setzero_si();
                    SIMD16::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports =
                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC,
                                    tessPa,
                                    workerId,
                                    prim_simd16,
                                    GenMask(numPrims),
                                    primID,
                                    vViewportIdx,
                                    vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx       = SIMD::setzero_si();
                    SIMD::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask    = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx              = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC,
                                tessPa,
                                workerId,
                                prim,
                                GenMask(tessPa.NumPrims()),
                                _simd_set1_epi32(dsContext.PrimitiveID),
                                vViewportIdx,
                                vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    }     // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

1649 THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
1650 THREAD uint32_t gVertexStoreSize = 0;
1651
1652 //////////////////////////////////////////////////////////////////////////
1653 /// @brief FE handler for SwrDraw.
1654 /// @tparam IsIndexedT - Is indexed drawing enabled
1655 /// @tparam HasTessellationT - Is tessellation enabled
1656 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1657 /// @tparam HasStreamOutT - Is stream-out enabled
1658 /// @tparam HasRastT - Is rasterization enabled
1659 /// @param pContext - pointer to SWR context.
1660 /// @param pDC - pointer to draw context.
1661 /// @param workerId - thread's worker id.
1662 /// @param pUserData - Pointer to DRAW_WORK
1663 template <typename IsIndexedT,
1664 typename IsCutIndexEnabledT,
1665 typename HasTessellationT,
1666 typename HasGeometryShaderT,
1667 typename HasStreamOutT,
1668 typename HasRastT>
1669 void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
1670 {
1671 #if KNOB_ENABLE_TOSS_POINTS
1672 if (KNOB_TOSS_QUEUE_FE)
1673 {
1674 return;
1675 }
1676 #endif
1677
1678 RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);
1679
1680 void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1681
1682 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1683 const API_STATE& state = GetApiState(pDC);
1684
1685 uint32_t indexSize = 0;
1686 uint32_t endVertex = work.numVerts;
1687
1688 gfxptr_t xpLastRequestedIndex = 0;
1689 if (IsIndexedT::value)
1690 {
1691 switch (work.type)
1692 {
1693 case R32_UINT:
1694 indexSize = sizeof(uint32_t);
1695 break;
1696 case R16_UINT:
1697 indexSize = sizeof(uint16_t);
1698 break;
1699 case R8_UINT:
1700 indexSize = sizeof(uint8_t);
1701 break;
1702 default:
1703 SWR_INVALID("Invalid work.type: %d", work.type);
1704 }
1705 xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
1706 }
1707 else
1708 {
1709 // No cuts, prune partial primitives.
1710 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1711 }
1712
1713 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1714 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1715 #endif
1716
1717 GsBuffers gsBuffers;
1718 if (HasGeometryShaderT::value)
1719 {
1720 #if USE_SIMD16_FRONTEND
1721 AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
1722 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1723 #else
1724 AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
1725 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1726 #endif
1727 }
1728
1729 if (HasTessellationT::value)
1730 {
1731 SWR_ASSERT(state.tsState.tsEnable == true);
1732 SWR_ASSERT(state.pfnHsFunc != nullptr);
1733 SWR_ASSERT(state.pfnDsFunc != nullptr);
1734
1735 AllocateTessellationData(pContext);
1736 }
1737 else
1738 {
1739 SWR_ASSERT(state.tsState.tsEnable == false);
1740 SWR_ASSERT(state.pfnHsFunc == nullptr);
1741 SWR_ASSERT(state.pfnDsFunc == nullptr);
1742 }
1743
1744 // allocate space for streamout input prim data
1745 uint32_t* pSoPrimData = nullptr;
1746 if (HasStreamOutT::value)
1747 {
1748 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1749 }
1750
1751 const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1752 #if USE_SIMD16_FRONTEND
1753 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1754 #else
1755 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1756 #endif
1757
1758 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1759
1760 // Compute storage requirements for vertex store
1761 // TODO: allocation needs to be rethought for better cut support
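// The PA state machine keeps extra SIMD-wide slots so that primitives which
// straddle a SIMD boundary can still reference vertices from the previous SIMD.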
1762 uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
1763 uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1764
1765 // grow the vertex store for the PA as necessary
1766 if (gVertexStoreSize < vertexStoreSize)
1767 {
1768 if (gpVertexStore != nullptr)
1769 {
1770 AlignedFree(gpVertexStore);
1771 gpVertexStore = nullptr;
1772 }
1773
1774 SWR_ASSERT(gpVertexStore == nullptr);
1775
1776 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
1777 gVertexStoreSize = vertexStoreSize;
1778
1779 SWR_ASSERT(gpVertexStore != nullptr);
1780 }
1781
1782 // choose primitive assembler
1783
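    // The factory returns an optimized fixed-topology PA when the topology and
    // index settings allow it, and the slower cut-aware PA otherwise.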
1784 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
1785 state.topology,
1786 work.numVerts,
1787 gpVertexStore,
1788 numVerts,
1789 state.frontendState.vsVertexSize,
1790 GetNumVerts(state.topology, 1));
1791 PA_STATE& pa = paFactory.GetPA();
1792
1793 #if USE_SIMD16_FRONTEND
1794 #if USE_SIMD16_SHADERS
1795 simd16vertex vin;
1796 #else
1797 simdvertex vin_lo;
1798 simdvertex vin_hi;
1799 #endif
1800 SWR_VS_CONTEXT vsContext_lo;
1801 SWR_VS_CONTEXT vsContext_hi;
1802
1803 #if USE_SIMD16_SHADERS
1804 vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
1805 vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
1806 #else
1807 vsContext_lo.pVin = &vin_lo;
1808 vsContext_hi.pVin = &vin_hi;
1809 #endif
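    // The hi context consumes the upper half of the shared simd16 vertex data
    // via the alternate offset.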
1810 vsContext_lo.AlternateOffset = 0;
1811 vsContext_hi.AlternateOffset = 1;
1812
1813 SWR_FETCH_CONTEXT fetchInfo_lo = {0};
1814
1815 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1816 fetchInfo_lo.StartInstance = work.startInstance;
1817 fetchInfo_lo.StartVertex = 0;
1818
1819 if (IsIndexedT::value)
1820 {
1821 fetchInfo_lo.BaseVertex = work.baseVertex;
1822
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
1825 fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1826 if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1827 {
1828 fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1829 }
1830 }
1831 else
1832 {
1833 fetchInfo_lo.StartVertex = work.startVertex;
1834 }
1835
1836 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1837
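    // Per-lane offsets 0..15, used to synthesize SIMD16 vertex ids for
    // non-indexed draws.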
1838 const simd16scalari vScale =
1839 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1840
1841 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1842 {
1843 uint32_t i = 0;
1844
1845 simd16scalari vIndex;
1846
1847 if (IsIndexedT::value)
1848 {
1849 fetchInfo_lo.xpIndices = work.xpIB;
1850 fetchInfo_hi.xpIndices =
1851 fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1852 }
1853 else
1854 {
1855 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1856
1857 fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
                GetPrivateState(pDC),
                reinterpret_cast<uint8_t*>(&vIndex) +
                    KNOB_SIMD_WIDTH * sizeof(int32_t)); // byte offset to the upper 1/2 of KNOB_SIMD16_WIDTH
1861 }
1862
1863 fetchInfo_lo.CurInstance = instanceNum;
1864 fetchInfo_hi.CurInstance = instanceNum;
1865
1866 vsContext_lo.InstanceID = instanceNum;
1867 vsContext_hi.InstanceID = instanceNum;
1868
1869 while (pa.HasWork())
1870 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
1873
1874 simdmask* pvCutIndices_lo = nullptr;
1875 simdmask* pvCutIndices_hi = nullptr;
1876
1877 if (IsIndexedT::value)
1878 {
1879 // simd16mask <=> simdmask[2]
1880
1881 pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
1882 pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
1883 }
1884
1885 simd16vertex& vout = pa.GetNextVsOutput();
1886
1887 vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
1888 vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
1889
1890 if (i < endVertex)
1891 {
1892 if (!IsIndexedT::value)
1893 {
1894 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                    uint32_t offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                    offset *= sizeof(int32_t); // convert from index count to byte offset
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.xpLastIndex += offset;
#else
                    // Split the byte offset between the lo and hi halves; the bounds
                    // must be expressed in bytes as well, and the hi offset is zero
                    // when the lo half already covers all remaining vertices.
                    uint32_t offsetLo =
                        std::min(offset, uint32_t(KNOB_SIMD_WIDTH * sizeof(int32_t)));
                    uint32_t offsetHi = offset - offsetLo;
                    fetchInfo_lo.xpLastIndex += offsetLo;
                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
                    fetchInfo_hi.xpLastIndex += offsetHi;
1907 #endif
1908 }
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1910 RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
1911 #if USE_SIMD16_SHADERS
1912 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
1913 #else
1914 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
1915
1916 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1917 {
1918 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
1919 }
1920 #endif
1921 RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
1922
                // Forward the fetch-generated vertex IDs to the vertex shader.
1924 #if USE_SIMD16_SHADERS
1925 #if USE_SIMD16_VS
1926 vsContext_lo.VertexID16 =
1927 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1928 vsContext_lo.VertexID16 =
1929 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1930 #else
1931 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1932 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1933 #endif
1934 #else
1935 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1936 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1937 #endif
1938
1939 // Setup active mask for vertex shader.
1940 #if USE_SIMD16_VS
1941 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1942 #else
1943 vsContext_lo.mask = GenerateMask(endVertex - i);
1944 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1945 #endif
1946
1947 // forward cut mask to the PA
1948 if (IsIndexedT::value)
1949 {
1950 #if USE_SIMD16_SHADERS
1951 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1952 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1953 #else
1954 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1955 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1956 #endif
1957 }
1958
1959 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1960
1961 #if KNOB_ENABLE_TOSS_POINTS
1962 if (!KNOB_TOSS_FETCH)
1963 #endif
1964 {
1965 RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
1966 #if USE_SIMD16_VS
1967 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1968 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1969 #else
1970 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1971 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1972
1973 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1974 {
1975 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
1976 AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
1977 }
1978 #endif
1979 RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
1980
1981 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1982 }
1983 }
1984
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
1986 do
1987 {
1988 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1989
1990 RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
1991 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1992 RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);
1993
1994 #if KNOB_ENABLE_TOSS_POINTS
1995 if (!KNOB_TOSS_FETCH)
1996 #endif
1997 {
1998 #if KNOB_ENABLE_TOSS_POINTS
1999 if (!KNOB_TOSS_VS)
2000 #endif
2001 {
2002 if (assemble)
2003 {
2004 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2005
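                            // Split the simd16 primitive set into lo/hi simd8
                            // halves for the simd8 downstream stages.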
2006 const uint32_t numPrims = pa.NumPrims();
2007 const uint32_t numPrims_lo =
2008 std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
2009 const uint32_t numPrims_hi =
2010 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
2011
2012 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
2013 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
2014 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
2015
2016 if (HasTessellationT::value)
2017 {
2018 pa.useAlternateOffset = false;
2019 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2020 pDC,
2021 workerId,
2022 pa,
2023 &gsBuffers,
2024 pSoPrimData,
2025 numPrims_lo,
2026 primID_lo);
2027
2028 if (numPrims_hi)
2029 {
2030 pa.useAlternateOffset = true;
2031 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2032 pDC,
2033 workerId,
2034 pa,
2035 &gsBuffers,
2036 pSoPrimData,
2037 numPrims_hi,
2038 primID_hi);
2039 }
2040 }
2041 else if (HasGeometryShaderT::value)
2042 {
2043 pa.useAlternateOffset = false;
2044 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2045 workerId,
2046 pa,
2047 &gsBuffers,
2048 pSoPrimData,
2049 numPrims_lo,
2050 primID_lo);
2051
2052 if (numPrims_hi)
2053 {
2054 pa.useAlternateOffset = true;
2055 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2056 workerId,
2057 pa,
2058 &gsBuffers,
2059 pSoPrimData,
2060 numPrims_hi,
2061 primID_hi);
2062 }
2063 }
2064 else
2065 {
2066 // If streamout is enabled then stream vertices out to memory.
2067 if (HasStreamOutT::value)
2068 {
2069 pa.useAlternateOffset = false;
2070 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2071 }
2072
2073 if (HasRastT::value)
2074 {
2075 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                                // Gather data from the SGV slot if provided.
2077 simd16scalari vpai = SIMD16::setzero_si();
2078 simd16scalari rtai = SIMD16::setzero_si();
2079 SIMD16::Vec4 svgAttrib[4];
2080
2081 if (state.backendState.readViewportArrayIndex ||
2082 state.backendState.readRenderTargetArrayIndex)
2083 {
2084 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2085 }
2086
2087 if (state.backendState.readViewportArrayIndex)
2088 {
2089 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2090 pa.viewportArrayActive = true;
2091 }
2092 if (state.backendState.readRenderTargetArrayIndex)
2093 {
2094 rtai =
2095 SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2096 pa.rtArrayActive = true;
2097 }
2098
2099 {
2100 // OOB VPAI indices => forced to zero.
2101 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
2102 simd16scalari vNumViewports =
2103 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2104 simd16scalari vClearMask =
2105 SIMD16::cmplt_epi32(vpai, vNumViewports);
2106 vpai = SIMD16::and_si(vClearMask, vpai);
2107
2108 pa.useAlternateOffset = false;
2109 pDC->pState->pfnProcessPrims_simd16(pDC,
2110 pa,
2111 workerId,
2112 prim_simd16,
2113 GenMask(numPrims),
2114 primID,
2115 vpai,
2116 rtai);
2117 }
2118 }
2119 }
2120 }
2121 }
2122 }
2123 } while (pa.NextPrim());
2124
2125 if (IsIndexedT::value)
2126 {
2127 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2128 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2129 }
2130 else
2131 {
2132 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
2133 }
2134
2135 i += KNOB_SIMD16_WIDTH;
2136 }
2137
2138 pa.Reset();
2139 }
2140
2141 #else
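    // Non-SIMD16 front end: a single simd8 fetch/VS/PA path.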
2142 SWR_VS_CONTEXT vsContext;
2143 SWR_FETCH_CONTEXT fetchInfo = {0};
2144
2145 fetchInfo.pStreams = &state.vertexBuffers[0];
2146 fetchInfo.StartInstance = work.startInstance;
2147 fetchInfo.StartVertex = 0;
2148
2149 if (IsIndexedT::value)
2150 {
2151 fetchInfo.BaseVertex = work.baseVertex;
2152
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
        // xpLastRequestedIndex holds a gfxptr_t; in this path gfx pointers are
        // plain host addresses, so compare and assign through casts.
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
2161 }
2162 else
2163 {
2164 fetchInfo.StartVertex = work.startVertex;
2165 }
2166
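    // Per-lane offsets 0..7, used to synthesize SIMD vertex ids for non-indexed draws.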
2167 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2168
2169 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2170 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
2171 {
2172 simdscalari vIndex;
2173 uint32_t i = 0;
2174
2175 if (IsIndexedT::value)
2176 {
            fetchInfo.pIndices = (const int32_t*)work.xpIB; // gfxptr_t holds a host address here
2178 }
2179 else
2180 {
2181 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
2182 fetchInfo.pIndices = (const int32_t*)&vIndex;
2183 }
2184
2185 fetchInfo.CurInstance = instanceNum;
2186 vsContext.InstanceID = instanceNum;
2187
2188 while (pa.HasWork())
2189 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
2192 simdmask* pvCutIndices = nullptr;
2193 if (IsIndexedT::value)
2194 {
2195 pvCutIndices = &pa.GetNextVsIndices();
2196 }
2197
2198 simdvertex& vout = pa.GetNextVsOutput();
2199 vsContext.pVin = &vout;
2200 vsContext.pVout = &vout;
2201
2202 if (i < endVertex)
2203 {
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
2205 RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
2206 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
2207 RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
2208
                // Forward the fetch-generated vertex IDs to the vertex shader.
2210 vsContext.VertexID = fetchInfo.VertexID;
2211
2212 // Setup active mask for vertex shader.
2213 vsContext.mask = GenerateMask(endVertex - i);
2214
2215 // forward cut mask to the PA
2216 if (IsIndexedT::value)
2217 {
2218 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2219 }
2220
2221 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2222
2223 #if KNOB_ENABLE_TOSS_POINTS
2224 if (!KNOB_TOSS_FETCH)
2225 #endif
2226 {
2227 RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
2228 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
2229 RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
2230
2231 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2232 AR_EVENT(VSStats((HANDLE)&vsContext.stats));
2233 }
2234 }
2235
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
2237 do
2238 {
2239 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // pa.Assemble returns false if there are not enough verts to assemble.
2241 RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
2242 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2243 RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
2244
2245 #if KNOB_ENABLE_TOSS_POINTS
2246 if (!KNOB_TOSS_FETCH)
2247 #endif
2248 {
2249 #if KNOB_ENABLE_TOSS_POINTS
2250 if (!KNOB_TOSS_VS)
2251 #endif
2252 {
2253 if (assemble)
2254 {
2255 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2256
2257 if (HasTessellationT::value)
2258 {
2259 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2260 pDC,
2261 workerId,
2262 pa,
2263 &gsBuffers,
2264 pSoPrimData,
2265 pa.GetPrimID(work.startPrimID));
2266 }
2267 else if (HasGeometryShaderT::value)
2268 {
2269 GeometryShaderStage<HasStreamOutT, HasRastT>(
2270 pDC,
2271 workerId,
2272 pa,
2273 &gsBuffers,
2274 pSoPrimData,
2275 pa.GetPrimID(work.startPrimID));
2276 }
2277 else
2278 {
2279 // If streamout is enabled then stream vertices out to memory.
2280 if (HasStreamOutT::value)
2281 {
2282 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2283 }
2284
2285 if (HasRastT::value)
2286 {
2287 SWR_ASSERT(pDC->pState->pfnProcessPrims);
2288
                            // Gather data from the SGV slot if provided.
2290 simdscalari vViewportIdx = SIMD::setzero_si();
2291 simdscalari vRtIdx = SIMD::setzero_si();
2292 SIMD::Vec4 svgAttrib[4];
2293
2294 if (state.backendState.readViewportArrayIndex ||
2295 state.backendState.readRenderTargetArrayIndex)
2296 {
2297 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2298 }
2299
2300 if (state.backendState.readViewportArrayIndex)
2301 {
2302 vViewportIdx =
2303 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2304
2305 // OOB VPAI indices => forced to zero.
2306 vViewportIdx =
2307 SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2308 simdscalari vNumViewports =
2309 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2310 simdscalari vClearMask =
2311 SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2312 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2313 pa.viewportArrayActive = true;
2314 }
2315 if (state.backendState.readRenderTargetArrayIndex)
2316 {
2317 vRtIdx =
2318 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2319 pa.rtArrayActive = true;
2320 }
2321
2322 pDC->pState->pfnProcessPrims(pDC,
2323 pa,
2324 workerId,
2325 prim,
2326 GenMask(pa.NumPrims()),
2327 pa.GetPrimID(work.startPrimID),
2328 vViewportIdx,
2329 vRtIdx);
2330 }
2331 }
2332 }
2333 }
2334 }
2335 } while (pa.NextPrim());
2336
2337 if (IsIndexedT::value)
2338 {
2339 fetchInfo.pIndices =
2340 (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2341 }
2342 else
2343 {
2344 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2345 }
2346
2347 i += KNOB_SIMD_WIDTH;
2348 }
2349 pa.Reset();
2350 }
2351
2352 #endif
2353
2354 RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
2355 }
2356
2357 struct FEDrawChooser
2358 {
2359 typedef PFN_FE_WORK_FUNC FuncType;
2360
2361 template <typename... ArgsB>
2362 static FuncType GetFunc()
2363 {
2364 return ProcessDraw<ArgsB...>;
2365 }
2366 };
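// TemplateArgUnroller expands each runtime bool into std::true_type /
// std::false_type template arguments, so the call below instantiates the
// matching ProcessDraw specialization at compile time.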
2367
2368 // Selector for correct templated Draw front-end function
2369 PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
2370 bool IsCutIndexEnabled,
2371 bool HasTessellation,
2372 bool HasGeometryShader,
2373 bool HasStreamOut,
2374 bool HasRasterization)
2375 {
2376 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
2377 IsCutIndexEnabled,
2378 HasTessellation,
2379 HasGeometryShader,
2380 HasStreamOut,
2381 HasRasterization);
2382 }
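
// Illustrative only: a hypothetical caller holding already-resolved pipeline
// flags (the names below are placeholders, not fields of API_STATE) would
// select and invoke the front-end function like so:
//
//   PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(isIndexed,
//                                               isIndexed && cutIndexEnabled,
//                                               tessEnabled,
//                                               gsEnabled,
//                                               soEnabled,
//                                               rastEnabled);
//   pfnFE(pContext, pDC, workerId, &drawWork);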