swr/rast: simdlib cleanup, clipper stack space fixes
src/gallium/drivers/swr/rasterizer/core/frontend.cpp (mesa.git)
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE uint32_t GenMask(uint32_t numBits)
47 {
48 SWR_ASSERT(
49 numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
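// Example (editor's note): GenMask builds a contiguous low-bit mask, e.g.
// GenMask(0) == 0x0, GenMask(3) == 0b111, GenMask(8) == 0xFF. Note that
// GenMask(32) passes the assert but shifts 1U by the full operand width,
// which is undefined behavior in C++; callers here stay well below 32 bits.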
52
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief FE handler for SwrSync.
55 /// @param pContext - pointer to SWR context.
56 /// @param pDC - pointer to draw context.
57 /// @param workerId - thread's worker id. Every thread has a unique id.
58 /// @param pUserData - Pointer to user data passed back to sync callback.
59 /// @todo This should go away when we switch this to use compute threading.
60 void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
61 {
62 BE_WORK work;
63 work.type = SYNC;
64 work.pfnWork = ProcessSyncBE;
65
66 MacroTileMgr* pTileMgr = pDC->pTileMgr;
67 pTileMgr->enqueue(0, 0, &work);
68 }
69
70 //////////////////////////////////////////////////////////////////////////
71 /// @brief FE handler for SwrDestroyContext.
72 /// @param pContext - pointer to SWR context.
73 /// @param pDC - pointer to draw context.
74 /// @param workerId - thread's worker id. Every thread has a unique id.
75 /// @param pUserData - Pointer to user data passed back to sync callback.
76 void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
77 {
78 BE_WORK work;
79 work.type = SHUTDOWN;
80 work.pfnWork = ProcessShutdownBE;
81
82 MacroTileMgr* pTileMgr = pDC->pTileMgr;
83 // Enqueue at least 1 work item for each worker thread,
84 // accounting for the number of NUMA nodes
85 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
86
87 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
88 {
89 for (uint32_t n = 0; n < numNumaNodes; ++n)
90 {
91 pTileMgr->enqueue(i, n, &work);
92 }
93 }
94 }
95
96 //////////////////////////////////////////////////////////////////////////
97 /// @brief FE handler for SwrClearRenderTarget.
98 /// @param pContext - pointer to SWR context.
99 /// @param pDC - pointer to draw context.
100 /// @param workerId - thread's worker id. Every thread has a unique id.
101 /// @param pUserData - Pointer to user data passed back to clear callback.
102 /// @todo This should go away when we switch this to use compute threading.
103 void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
104 {
105 CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData;
106 MacroTileMgr* pTileMgr = pDC->pTileMgr;
107
108 // queue a clear to each macro tile
109 // compute macro tile bounds for the specified rect
110 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
111 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
112 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
113 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
114
115 BE_WORK work;
116 work.type = CLEAR;
117 work.pfnWork = ProcessClearBE;
118 work.desc.clear = *pDesc;
119
120 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
121 {
122 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
123 {
124 pTileMgr->enqueue(x, y, &work);
125 }
126 }
127 }
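// Worked example (editor's note; the macro tile dims are build-time knobs,
// 32x32 assumed here): a clear rect of (xmin=8, ymin=8, xmax=72, ymax=40) yields
//   macroTileXMin = 8/32 = 0,  macroTileXMax = (72-1)/32 = 2,
//   macroTileYMin = 8/32 = 0,  macroTileYMax = (40-1)/32 = 1,
// enqueueing the clear to the 3x2 grid of touched macro tiles. The "-1" keeps
// an exclusive max edge that lands exactly on a tile boundary from enqueueing
// one extra tile.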
128
129 //////////////////////////////////////////////////////////////////////////
130 /// @brief FE handler for SwrStoreTiles.
131 /// @param pContext - pointer to SWR context.
132 /// @param pDC - pointer to draw context.
133 /// @param workerId - thread's worker id. Every thread has a unique id.
134 /// @param pUserData - Pointer to user data passed back to callback.
135 /// @todo This should go away when we switch this to use compute threading.
136 void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
137 {
138 RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId);
139 MacroTileMgr* pTileMgr = pDC->pTileMgr;
140 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
141
142 // queue a store to each macro tile
143 // compute macro tile bounds for the specified rect
144 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
145 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
146 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
147 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
148
149 // store tiles
150 BE_WORK work;
151 work.type = STORETILES;
152 work.pfnWork = ProcessStoreTilesBE;
153 work.desc.storeTiles = *pDesc;
154
155 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
156 {
157 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
158 {
159 pTileMgr->enqueue(x, y, &work);
160 }
161 }
162
163 RDTSC_END(FEProcessStoreTiles, 0);
164 }
165
166 //////////////////////////////////////////////////////////////////////////
167 /// @brief FE handler for SwrInvalidateTiles.
168 /// @param pContext - pointer to SWR context.
169 /// @param pDC - pointer to draw context.
170 /// @param workerId - thread's worker id. Every thread has a unique id.
171 /// @param pUserData - Pointer to user data passed back to callback.
172 /// @todo This should go away when we switch this to use compute threading.
173 void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
174 DRAW_CONTEXT* pDC,
175 uint32_t workerId,
176 void* pUserData)
177 {
178 RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
179 DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
180 MacroTileMgr* pTileMgr = pDC->pTileMgr;
181
182 // compute macro tile bounds for the specified rect
183 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
184 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
185 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
186 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
187
188 if (pDesc->fullTilesOnly == false)
189 {
190 // include partial tiles
191 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
192 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
193 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
194 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
195 }
196
197 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
198 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
199
200 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
201 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
202
203 // load tiles
204 BE_WORK work;
205 work.type = DISCARDINVALIDATETILES;
206 work.pfnWork = ProcessDiscardInvalidateTilesBE;
207 work.desc.discardInvalidateTiles = *pDesc;
208
209 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
210 {
211 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
212 {
213 pTileMgr->enqueue(x, y, &work);
214 }
215 }
216
217 RDTSC_END(FEProcessInvalidateTiles, 0);
218 }
219
220 //////////////////////////////////////////////////////////////////////////
221 /// @brief Computes the number of primitives given the number of verts.
222 /// @param mode - primitive topology for draw operation.
223 /// @param numPrims - number of vertices or indices for draw.
224 /// @todo Frontend needs to be refactored. This will go in the appropriate place then.
225 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
226 {
227 switch (mode)
228 {
229 case TOP_POINT_LIST:
230 return numPrims;
231 case TOP_TRIANGLE_LIST:
232 return numPrims / 3;
233 case TOP_TRIANGLE_STRIP:
234 return numPrims < 3 ? 0 : numPrims - 2;
235 case TOP_TRIANGLE_FAN:
236 return numPrims < 3 ? 0 : numPrims - 2;
237 case TOP_TRIANGLE_DISC:
238 return numPrims < 2 ? 0 : numPrims - 1;
239 case TOP_QUAD_LIST:
240 return numPrims / 4;
241 case TOP_QUAD_STRIP:
242 return numPrims < 4 ? 0 : (numPrims - 2) / 2;
243 case TOP_LINE_STRIP:
244 return numPrims < 2 ? 0 : numPrims - 1;
245 case TOP_LINE_LIST:
246 return numPrims / 2;
247 case TOP_LINE_LOOP:
248 return numPrims;
249 case TOP_RECT_LIST:
250 return numPrims / 3;
251 case TOP_LINE_LIST_ADJ:
252 return numPrims / 4;
253 case TOP_LISTSTRIP_ADJ:
254 return numPrims < 3 ? 0 : numPrims - 3;
255 case TOP_TRI_LIST_ADJ:
256 return numPrims / 6;
257 case TOP_TRI_STRIP_ADJ:
258 return numPrims < 4 ? 0 : (numPrims / 2) - 2;
259
260 case TOP_PATCHLIST_1:
261 case TOP_PATCHLIST_2:
262 case TOP_PATCHLIST_3:
263 case TOP_PATCHLIST_4:
264 case TOP_PATCHLIST_5:
265 case TOP_PATCHLIST_6:
266 case TOP_PATCHLIST_7:
267 case TOP_PATCHLIST_8:
268 case TOP_PATCHLIST_9:
269 case TOP_PATCHLIST_10:
270 case TOP_PATCHLIST_11:
271 case TOP_PATCHLIST_12:
272 case TOP_PATCHLIST_13:
273 case TOP_PATCHLIST_14:
274 case TOP_PATCHLIST_15:
275 case TOP_PATCHLIST_16:
276 case TOP_PATCHLIST_17:
277 case TOP_PATCHLIST_18:
278 case TOP_PATCHLIST_19:
279 case TOP_PATCHLIST_20:
280 case TOP_PATCHLIST_21:
281 case TOP_PATCHLIST_22:
282 case TOP_PATCHLIST_23:
283 case TOP_PATCHLIST_24:
284 case TOP_PATCHLIST_25:
285 case TOP_PATCHLIST_26:
286 case TOP_PATCHLIST_27:
287 case TOP_PATCHLIST_28:
288 case TOP_PATCHLIST_29:
289 case TOP_PATCHLIST_30:
290 case TOP_PATCHLIST_31:
291 case TOP_PATCHLIST_32:
292 return numPrims / (mode - TOP_PATCHLIST_BASE);
293
294 case TOP_POLYGON:
295 case TOP_POINT_LIST_BF:
296 case TOP_LINE_STRIP_CONT:
297 case TOP_LINE_STRIP_BF:
298 case TOP_LINE_STRIP_CONT_BF:
299 case TOP_TRIANGLE_FAN_NOSTIPPLE:
300 case TOP_TRI_STRIP_REVERSE:
301 case TOP_PATCHLIST_BASE:
302 case TOP_UNKNOWN:
303 SWR_INVALID("Unsupported topology: %d", mode);
304 return 0;
305 }
306
307 return 0;
308 }
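// Worked examples (editor's note), for a 7-vertex draw:
//   TOP_TRIANGLE_LIST  -> 7 / 3 = 2 prims (trailing vertex pruned),
//   TOP_TRIANGLE_STRIP -> 7 - 2 = 5 prims,
//   TOP_LINE_STRIP     -> 7 - 1 = 6 prims,
//   TOP_PATCHLIST_4    -> 7 / 4 = 1 patch.
// As the @param note says, numPrims is actually a vertex/index count here.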
309
310 //////////////////////////////////////////////////////////////////////////
311 /// @brief Computes the number of verts given the number of primitives.
312 /// @param mode - primitive topology for draw operation.
313 /// @param numPrims - number of primitives for draw.
314 uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
315 {
316 switch (mode)
317 {
318 case TOP_POINT_LIST:
319 return numPrims;
320 case TOP_TRIANGLE_LIST:
321 return numPrims * 3;
322 case TOP_TRIANGLE_STRIP:
323 return numPrims ? numPrims + 2 : 0;
324 case TOP_TRIANGLE_FAN:
325 return numPrims ? numPrims + 2 : 0;
326 case TOP_TRIANGLE_DISC:
327 return numPrims ? numPrims + 1 : 0;
328 case TOP_QUAD_LIST:
329 return numPrims * 4;
330 case TOP_QUAD_STRIP:
331 return numPrims ? numPrims * 2 + 2 : 0;
332 case TOP_LINE_STRIP:
333 return numPrims ? numPrims + 1 : 0;
334 case TOP_LINE_LIST:
335 return numPrims * 2;
336 case TOP_LINE_LOOP:
337 return numPrims;
338 case TOP_RECT_LIST:
339 return numPrims * 3;
340 case TOP_LINE_LIST_ADJ:
341 return numPrims * 4;
342 case TOP_LISTSTRIP_ADJ:
343 return numPrims ? numPrims + 3 : 0;
344 case TOP_TRI_LIST_ADJ:
345 return numPrims * 6;
346 case TOP_TRI_STRIP_ADJ:
347 return numPrims ? (numPrims + 2) * 2 : 0;
348
349 case TOP_PATCHLIST_1:
350 case TOP_PATCHLIST_2:
351 case TOP_PATCHLIST_3:
352 case TOP_PATCHLIST_4:
353 case TOP_PATCHLIST_5:
354 case TOP_PATCHLIST_6:
355 case TOP_PATCHLIST_7:
356 case TOP_PATCHLIST_8:
357 case TOP_PATCHLIST_9:
358 case TOP_PATCHLIST_10:
359 case TOP_PATCHLIST_11:
360 case TOP_PATCHLIST_12:
361 case TOP_PATCHLIST_13:
362 case TOP_PATCHLIST_14:
363 case TOP_PATCHLIST_15:
364 case TOP_PATCHLIST_16:
365 case TOP_PATCHLIST_17:
366 case TOP_PATCHLIST_18:
367 case TOP_PATCHLIST_19:
368 case TOP_PATCHLIST_20:
369 case TOP_PATCHLIST_21:
370 case TOP_PATCHLIST_22:
371 case TOP_PATCHLIST_23:
372 case TOP_PATCHLIST_24:
373 case TOP_PATCHLIST_25:
374 case TOP_PATCHLIST_26:
375 case TOP_PATCHLIST_27:
376 case TOP_PATCHLIST_28:
377 case TOP_PATCHLIST_29:
378 case TOP_PATCHLIST_30:
379 case TOP_PATCHLIST_31:
380 case TOP_PATCHLIST_32:
381 return numPrims * (mode - TOP_PATCHLIST_BASE);
382
383 case TOP_POLYGON:
384 case TOP_POINT_LIST_BF:
385 case TOP_LINE_STRIP_CONT:
386 case TOP_LINE_STRIP_BF:
387 case TOP_LINE_STRIP_CONT_BF:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE:
389 case TOP_TRI_STRIP_REVERSE:
390 case TOP_PATCHLIST_BASE:
391 case TOP_UNKNOWN:
392 SWR_INVALID("Unsupported topology: %d", mode);
393 return 0;
394 }
395
396 return 0;
397 }
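// Editor's note: GetNumVerts inverts GetNumPrims for whole primitives, e.g.
// GetNumVerts(TOP_TRIANGLE_STRIP, GetNumPrims(TOP_TRIANGLE_STRIP, 7)) == 7,
// while list topologies round down (7 tri-list verts -> 2 prims -> 6 verts).
// The non-indexed path in ProcessDraw composes the two exactly this way to
// prune trailing partial primitives.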
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
404 {
405 uint32_t numVerts = 0;
406 switch (topology)
407 {
408 case TOP_POINT_LIST:
409 case TOP_POINT_LIST_BF:
410 numVerts = 1;
411 break;
412 case TOP_LINE_LIST:
413 case TOP_LINE_STRIP:
414 case TOP_LINE_LIST_ADJ:
415 case TOP_LINE_LOOP:
416 case TOP_LINE_STRIP_CONT:
417 case TOP_LINE_STRIP_BF:
418 case TOP_LISTSTRIP_ADJ:
419 numVerts = 2;
420 break;
421 case TOP_TRIANGLE_LIST:
422 case TOP_TRIANGLE_STRIP:
423 case TOP_TRIANGLE_FAN:
424 case TOP_TRI_LIST_ADJ:
425 case TOP_TRI_STRIP_ADJ:
426 case TOP_TRI_STRIP_REVERSE:
427 case TOP_RECT_LIST:
428 numVerts = 3;
429 break;
430 case TOP_QUAD_LIST:
431 case TOP_QUAD_STRIP:
432 numVerts = 4;
433 break;
434 case TOP_PATCHLIST_1:
435 case TOP_PATCHLIST_2:
436 case TOP_PATCHLIST_3:
437 case TOP_PATCHLIST_4:
438 case TOP_PATCHLIST_5:
439 case TOP_PATCHLIST_6:
440 case TOP_PATCHLIST_7:
441 case TOP_PATCHLIST_8:
442 case TOP_PATCHLIST_9:
443 case TOP_PATCHLIST_10:
444 case TOP_PATCHLIST_11:
445 case TOP_PATCHLIST_12:
446 case TOP_PATCHLIST_13:
447 case TOP_PATCHLIST_14:
448 case TOP_PATCHLIST_15:
449 case TOP_PATCHLIST_16:
450 case TOP_PATCHLIST_17:
451 case TOP_PATCHLIST_18:
452 case TOP_PATCHLIST_19:
453 case TOP_PATCHLIST_20:
454 case TOP_PATCHLIST_21:
455 case TOP_PATCHLIST_22:
456 case TOP_PATCHLIST_23:
457 case TOP_PATCHLIST_24:
458 case TOP_PATCHLIST_25:
459 case TOP_PATCHLIST_26:
460 case TOP_PATCHLIST_27:
461 case TOP_PATCHLIST_28:
462 case TOP_PATCHLIST_29:
463 case TOP_PATCHLIST_30:
464 case TOP_PATCHLIST_31:
465 case TOP_PATCHLIST_32:
466 numVerts = topology - TOP_PATCHLIST_BASE;
467 break;
468 default:
469 SWR_INVALID("Unsupported topology: %d", topology);
470 break;
471 }
472
473 if (includeAdjVerts)
474 {
475 switch (topology)
476 {
477 case TOP_LISTSTRIP_ADJ:
478 case TOP_LINE_LIST_ADJ:
479 numVerts = 4;
480 break;
481 case TOP_TRI_STRIP_ADJ:
482 case TOP_TRI_LIST_ADJ:
483 numVerts = 6;
484 break;
485 default:
486 break;
487 }
488 }
489
490 return numVerts;
491 }
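// Example (editor's note): NumVertsPerPrim(TOP_TRI_LIST_ADJ, false) == 3,
// just the assembled triangle, while NumVertsPerPrim(TOP_TRI_LIST_ADJ, true)
// == 6, counting the adjacent vertices a geometry shader can read.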
492
493 //////////////////////////////////////////////////////////////////////////
494 /// @brief Generate mask from remaining work.
495 /// @param numWorkItems - Number of items being worked on by a SIMD.
496 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
497 {
498 uint32_t numActive =
499 (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
500 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
501 return _simd_castps_si(_simd_vmask_ps(mask));
502 }
503
504 static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
505 {
506 uint32_t numActive =
507 (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
508 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
509 return _simd16_castps_si(_simd16_vmask_ps(mask));
510 }
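// Example (editor's note): with an 8-wide SIMD, GenerateMask(3) activates
// lanes 0-2 (mask 0b00000111) and GenerateMask(11) saturates to all eight
// lanes (0xFF); GenerateMask16 does the same for sixteen lanes.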
511
512 //////////////////////////////////////////////////////////////////////////
513 /// @brief StreamOut - Streams vertex data out to SO buffers.
514 /// Generally, we are only streaming out a SIMD's worth of triangles.
515 /// @param pDC - pointer to draw context.
516 /// @param workerId - thread's worker id. Every thread has a unique id.
517 /// @param numPrims - Number of prims to stream out (e.g. points, lines, tris)
518 static void StreamOut(
519 DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
520 {
521 RDTSC_BEGIN(FEStreamout, pDC->drawId);
522
523 const API_STATE& state = GetApiState(pDC);
524 const SWR_STREAMOUT_STATE& soState = state.soState;
525
526 uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
527
528 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
529 // vertex.
530 uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
531
532 SWR_STREAMOUT_CONTEXT soContext = {0};
533
534 // Setup buffer state pointers.
535 for (uint32_t i = 0; i < 4; ++i)
536 {
537 soContext.pBuffer[i] = &state.soBuffer[i];
538 }
539
540 uint32_t numPrims = pa.NumPrims();
541
542 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
543 {
544 DWORD slot = 0;
545 uint64_t soMask = soState.streamMasks[streamIndex];
546
547 // Write all entries into primitive data buffer for SOS.
548 while (_BitScanForward64(&slot, soMask))
549 {
550 simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
551 uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
552 pa.AssembleSingle(paSlot, primIndex, attrib);
553
554 // Attribute offset is relative offset from start of vertex.
555 // Note that attributes start at slot 1 in the PA buffer for DX, which
556 // would write prim data starting at slot 0 via (slot - 1).
557 // GL works slightly differently and needs slot 0, which is what this code uses.
558 uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
559
560 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
561 for (uint32_t v = 0; v < soVertsPerPrim; ++v)
562 {
563 uint32_t* pPrimDataAttrib =
564 pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
565
566 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
567 }
568
569 soMask &= ~(uint64_t(1) << slot);
570 }
571
572 // Update pPrimData pointer
573 soContext.pPrimData = pPrimData;
574
575 // Call SOS
576 SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
577 "Trying to execute uninitialized streamout jit function.");
578 state.pfnSoFunc[streamIndex](soContext);
579 }
580
581 // Update SO write offset. The driver provides memory for the update.
582 for (uint32_t i = 0; i < 4; ++i)
583 {
584 if (state.soBuffer[i].pWriteOffset)
585 {
586 *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
587 }
588
589 if (state.soBuffer[i].soWriteEnable)
590 {
591 pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
592 pDC->dynState.SoWriteOffsetDirty[i] = true;
593 }
594 }
595
596 UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
597 UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
598
599 RDTSC_END(FEStreamout, 1);
600 }
601
602 #if USE_SIMD16_FRONTEND
603 //////////////////////////////////////////////////////////////////////////
604 /// Is value an even number (a multiple of two)
605 ///
606 template <typename T>
607 INLINE static bool IsEven(T value)
608 {
609 return (value & 1) == 0;
610 }
611
612 //////////////////////////////////////////////////////////////////////////
613 /// Round up value to an even number (a multiple of two)
614 ///
615 template <typename T>
616 INLINE static T RoundUpEven(T value)
617 {
618 return (value + 1) & ~1;
619 }
620
621 //////////////////////////////////////////////////////////////////////////
622 /// Round down value to an even number (a multiple of two)
623 ///
624 template <typename T>
625 INLINE static T RoundDownEven(T value)
626 {
627 return value & ~1;
628 }
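// Examples (editor's note): IsEven(6) == true, IsEven(7) == false;
// RoundUpEven(7) == 8, RoundUpEven(8) == 8; RoundDownEven(7) == 6.
// These pad simd8 counts and strides up to simd16-friendly multiples below.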
629
630 //////////////////////////////////////////////////////////////////////////
631 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
632 ///
633 /// vertexCount is in terms of the source simdvertexes; an odd final vertex leaves the upper eight lanes zeroed
634 ///
635 /// attribCount will limit the vector copies to those attribs specified
636 ///
637 /// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
638 ///
639 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
640 const simdvertex* vertex,
641 uint32_t vertexCount,
642 uint32_t attribCount)
643 {
644 SWR_ASSERT(vertex);
645 SWR_ASSERT(vertex_simd16);
646 SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
647
648 simd16vertex temp;
649
650 for (uint32_t i = 0; i < vertexCount; i += 2)
651 {
652 for (uint32_t j = 0; j < attribCount; j += 1)
653 {
654 for (uint32_t k = 0; k < 4; k += 1)
655 {
656 temp.attrib[j][k] =
657 _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
658
659 if ((i + 1) < vertexCount)
660 {
661 temp.attrib[j][k] =
662 _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
663 }
664 }
665 }
666
667 for (uint32_t j = 0; j < attribCount; j += 1)
668 {
669 vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
670 }
671 }
672 }
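// Layout sketch (editor's note): for each attribute component, the source
// pair (vertex[i], vertex[i + 1]) becomes one simd16 register in
// vertex_simd16[i / 2], with vertex[i] in lanes 0-7 and vertex[i + 1] in
// lanes 8-15 (lanes 8-15 are zero when vertexCount is odd).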
673
674 #endif
675 //////////////////////////////////////////////////////////////////////////
676 /// @brief Computes the number of invocations. The current index represents
677 /// the start of the SIMD. The max index represents how many work
678 /// items remain. If fewer than a SIMD's worth of work items remain,
679 /// this returns the remaining amount of work.
680 /// @param curIndex - The start index for the SIMD.
681 /// @param maxIndex - The last index for all work items.
682 static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
683 {
684 uint32_t remainder = (maxIndex - curIndex);
685 #if USE_SIMD16_FRONTEND
686 return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
687 #else
688 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
689 #endif
690 }
691
692 //////////////////////////////////////////////////////////////////////////
693 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
694 /// The geometry shader will loop over each active streamout buffer, assembling
695 /// primitives for the downstream stages. When multistream output is enabled,
696 /// the generated stream ID buffer from the GS needs to be converted to a cut
697 /// buffer for the primitive assembler.
698 /// @param stream - stream id to generate the cut buffer for
699 /// @param pStreamIdBase - pointer to the stream ID buffer
700 /// @param numEmittedVerts - Number of total verts emitted by the GS
701 /// @param pCutBuffer - output buffer to write cuts to
702 void ProcessStreamIdBuffer(uint32_t stream,
703 uint8_t* pStreamIdBase,
704 uint32_t numEmittedVerts,
705 uint8_t* pCutBuffer)
706 {
707 SWR_ASSERT(stream < MAX_SO_STREAMS);
708
709 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
710 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
711
712 for (uint32_t b = 0; b < numOutputBytes; ++b)
713 {
714 uint8_t curInputByte = pStreamIdBase[2 * b];
715 uint8_t outByte = 0;
716 for (uint32_t i = 0; i < 4; ++i)
717 {
718 if ((curInputByte & 0x3) != stream)
719 {
720 outByte |= (1 << i);
721 }
722 curInputByte >>= 2;
723 }
724
725 curInputByte = pStreamIdBase[2 * b + 1];
726 for (uint32_t i = 0; i < 4; ++i)
727 {
728 if ((curInputByte & 0x3) != stream)
729 {
730 outByte |= (1 << (i + 4));
731 }
732 curInputByte >>= 2;
733 }
734
735 *pCutBuffer++ = outByte;
736 }
737 }
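// Worked example (editor's note): each vertex owns 2 stream-ID bits, so one
// input byte covers 4 verts. For stream == 1 and input byte 0b01000111
// (vert0..vert3 carry stream IDs 3, 1, 0, 1, reading from the low bits), the
// low output nibble is 0b0101: a 1 ("cut") for each vertex NOT on stream 1.
// The primitive assembler then treats cut verts as primitive breaks.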
738
739 // Buffers that are allocated if GS is enabled
740 struct GsBuffers
741 {
742 uint8_t* pGsIn; // assembled GS input vertices (simdvector SOA)
743 uint8_t* pGsOut[KNOB_SIMD_WIDTH]; // one GS output vertex stream per SIMD lane
744 uint8_t* pGsTransposed; // GS output transposed for the primitive assembler
745 void* pStreamCutBuffer; // scratch cut buffer for multistream GS output
746 };
747
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
750 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
751 /// assembler
752 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
753 /// @param numVerts - Number of vertices output by the GS
754 /// @param numAttribs - Number of attributes per vertex
755 template <typename SIMD_T, uint32_t SimdWidth>
756 void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
757 {
758 uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
759 uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
760
761 OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
762
763 for (uint32_t i = 0; i < SimdWidth; ++i)
764 {
765 gatherOffsets[i] = srcVertexStride * i;
766 }
767 auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
768
769 uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
770 uint32_t remainingVerts = numVerts;
771
772 for (uint32_t s = 0; s < numSimd; ++s)
773 {
774 uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
775 uint8_t* pDstBase = pDst + s * dstVertexStride;
776
777 // Compute mask to prevent src overflow
778 uint32_t mask = std::min(remainingVerts, SimdWidth);
779 mask = GenMask(mask);
780 auto vMask = SIMD_T::vmask_ps(mask);
781 auto viMask = SIMD_T::castps_si(vMask);
782
783 for (uint32_t a = 0; a < numAttribs; ++a)
784 {
785 auto attribGatherX = SIMD_T::mask_i32gather_ps(
786 SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
787 auto attribGatherY = SIMD_T::mask_i32gather_ps(
788 SIMD_T::setzero_ps(),
789 (const float*)(pSrcBase + sizeof(float)),
790 vGatherOffsets,
791 vMask);
792 auto attribGatherZ = SIMD_T::mask_i32gather_ps(
793 SIMD_T::setzero_ps(),
794 (const float*)(pSrcBase + sizeof(float) * 2),
795 vGatherOffsets,
796 vMask);
797 auto attribGatherW = SIMD_T::mask_i32gather_ps(
798 SIMD_T::setzero_ps(),
799 (const float*)(pSrcBase + sizeof(float) * 3),
800 vGatherOffsets,
801 vMask);
802
803 SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
804 SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
805 SIMD_T::maskstore_ps(
806 (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
807 SIMD_T::maskstore_ps(
808 (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
809
810 pSrcBase += sizeof(float) * 4;
811 pDstBase += sizeof(Float<SIMD_T>) * 4;
812 }
813 remainingVerts -= SimdWidth;
814 }
815 }
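// Editor's note: per attribute, each gather pulls one component from
// SimdWidth consecutive source vertices (stride srcVertexStride), so the
// destination ends up laid out as
//   x0..x7 | y0..y7 | z0..z7 | w0..w7 | <next attribute> ...
// for an 8-wide SIMD. The mask keeps the final, partially filled batch from
// reading or writing past numVerts.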
816
817
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief Implements GS stage.
820 /// @param pDC - pointer to draw context.
821 /// @param workerId - thread's worker id. Every thread has a unique id.
822 /// @param pa - The primitive assembly object.
823 /// @param pGsOut - output stream for GS
824 template <typename HasStreamOutT, typename HasRastT>
825 static void GeometryShaderStage(DRAW_CONTEXT* pDC,
826 uint32_t workerId,
827 PA_STATE& pa,
828 GsBuffers* pGsBuffers,
829 uint32_t* pSoPrimData,
830 #if USE_SIMD16_FRONTEND
831 uint32_t numPrims_simd8,
832 #endif
833 simdscalari const& primID)
834 {
835 RDTSC_BEGIN(FEGeometryShader, pDC->drawId);
836
837 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
838
839 const API_STATE& state = GetApiState(pDC);
840 const SWR_GS_STATE* pState = &state.gsState;
841 SWR_GS_CONTEXT gsContext;
842
843 static uint8_t sNullBuffer[128] = {0};
844
845 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
846 {
847 gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
848 }
849 gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
850 gsContext.PrimitiveID = primID;
851
852 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
853 simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
854
855 // assemble all attributes for the input primitive
856 gsContext.inputVertStride = pState->inputVertStride;
857 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
858 {
859 uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
860 uint32_t attribSlot = pState->vertexAttribOffset + slot;
861 pa.Assemble(srcAttribSlot, attrib);
862
863 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
864 {
865 gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
866 }
867 }
868
869 // assemble position
870 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
871 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
872 {
873 gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
874 }
875
876 // record valid prims from the frontend to avoid over binning the newly generated
877 // prims from the GS
878 #if USE_SIMD16_FRONTEND
879 uint32_t numInputPrims = numPrims_simd8;
880 #else
881 uint32_t numInputPrims = pa.NumPrims();
882 #endif
883
884 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
885 {
886 gsContext.InstanceID = instance;
887 gsContext.mask = GenerateMask(numInputPrims);
888
889 // execute the geometry shader
890 state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
891 AR_EVENT(GSStats((HANDLE)&gsContext.stats));
892
893 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
894 {
895 gsContext.pStreams[i] += pState->allocationSize;
896 }
897 }
898
899 // set up new binner and state for the GS output topology
900 #if USE_SIMD16_FRONTEND
901 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
902 if (HasRastT::value)
903 {
904 switch (pState->outputTopology)
905 {
906 case TOP_RECT_LIST:
907 pfnClipFunc = ClipRectangles_simd16;
908 break;
909 case TOP_TRIANGLE_STRIP:
910 pfnClipFunc = ClipTriangles_simd16;
911 break;
912 case TOP_LINE_STRIP:
913 pfnClipFunc = ClipLines_simd16;
914 break;
915 case TOP_POINT_LIST:
916 pfnClipFunc = ClipPoints_simd16;
917 break;
918 default:
919 SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
920 }
921 }
922
923 #else
924 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
925 if (HasRastT::value)
926 {
927 switch (pState->outputTopology)
928 {
929 case TOP_RECT_LIST:
930 pfnClipFunc = ClipRectangles;
931 break;
932 case TOP_TRIANGLE_STRIP:
933 pfnClipFunc = ClipTriangles;
934 break;
935 case TOP_LINE_STRIP:
936 pfnClipFunc = ClipLines;
937 break;
938 case TOP_POINT_LIST:
939 pfnClipFunc = ClipPoints;
940 break;
941 default:
942 SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
943 }
944 }
945
946 #endif
947 // foreach input prim:
948 // - setup a new PA based on the emitted verts for that prim
949 // - loop over the new verts, calling PA to assemble each prim
950 uint32_t* pPrimitiveId = (uint32_t*)&primID;
951
952 uint32_t totalPrimsGenerated = 0;
953 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
954 {
955 uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
956
957 // Vertex count is either emitted by shader or static
958 uint32_t vertexCount = 0;
959 if (pState->staticVertexCount)
960 {
961 vertexCount = pState->staticVertexCount;
962 }
963 else
964 {
965 // If emitted in shader, it is stored in the first dword of the output buffer
966 vertexCount = *(uint32_t*)pInstanceBase;
967 }
968
969 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
970 {
971 uint32_t numEmittedVerts = vertexCount;
972 if (numEmittedVerts == 0)
973 {
974 continue;
975 }
976
977 uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
978 uint8_t* pCutBase =
979 pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
980 uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
981
982 #if USE_SIMD16_FRONTEND
983 TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
984 pVertexBaseAOS,
985 vertexCount,
986 pState->outputVertexSize);
987 #else
988 TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
989 pVertexBaseAOS,
990 vertexCount,
991 pState->outputVertexSize);
992 #endif
993
994 uint32_t numAttribs = state.feNumAttributes;
995
996 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
997 {
998 bool processCutVerts = false;
999 uint8_t* pCutBuffer = pCutBase;
1000
1001 // assign default stream ID, only relevant when GS is outputting a single stream
1002 uint32_t streamID = 0;
1003 if (pState->isSingleStream)
1004 {
1005 processCutVerts = true;
1006 streamID = pState->singleStreamID;
1007 if (streamID != stream)
1008 continue;
1009 }
1010 else
1011 {
1012 // early exit if this stream is not enabled for streamout
1013 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
1014 {
1015 continue;
1016 }
1017
1018 // multi-stream output, need to translate StreamID buffer to a cut buffer
1019 ProcessStreamIdBuffer(
1020 stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
1021 pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
1022 processCutVerts = false;
1023 }
1024
1025 #if USE_SIMD16_FRONTEND
1026 PA_STATE_CUT gsPa(pDC,
1027 (uint8_t*)pGsBuffers->pGsTransposed,
1028 numEmittedVerts,
1029 pState->outputVertexSize,
1030 reinterpret_cast<simd16mask*>(pCutBuffer),
1031 numEmittedVerts,
1032 numAttribs,
1033 pState->outputTopology,
1034 processCutVerts,
1035 pa.numVertsPerPrim);
1036
1037 #else
1038 PA_STATE_CUT gsPa(pDC,
1039 (uint8_t*)pGsBuffers->pGsTransposed,
1040 numEmittedVerts,
1041 pState->outputVertexSize,
1042 pCutBuffer,
1043 numEmittedVerts,
1044 numAttribs,
1045 pState->outputTopology,
1046 processCutVerts,
1047 pa.numVertsPerPrim);
1048
1049 #endif
1050 while (gsPa.GetNextStreamOutput())
1051 {
1052 do
1053 {
1054 #if USE_SIMD16_FRONTEND
1055 simd16vector attrib_simd16[3];
1056
1057 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
1058
1059 #else
1060 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
1061
1062 #endif
1063 if (assemble)
1064 {
1065 totalPrimsGenerated += gsPa.NumPrims();
1066
1067 if (HasStreamOutT::value)
1068 {
1069 #if ENABLE_AVX512_SIMD16
1070 gsPa.useAlternateOffset = false;
1071 #endif
1072 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
1073 }
1074
1075 if (HasRastT::value && state.soState.streamToRasterizer == stream)
1076 {
1077 #if USE_SIMD16_FRONTEND
1078 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
1079
1080 // Gather data from the SGV if provided.
1081 simd16scalari vViewportIdx = SIMD16::setzero_si();
1082 simd16scalari vRtIdx = SIMD16::setzero_si();
1083 SIMD16::Vec4 svgAttrib[4];
1084
1085 if (state.backendState.readViewportArrayIndex ||
1086 state.backendState.readRenderTargetArrayIndex)
1087 {
1088 gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1089 }
1090
1091 if (state.backendState.readViewportArrayIndex)
1092 {
1093 vViewportIdx =
1094 SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1095 gsPa.viewportArrayActive = true;
1096 }
1097 if (state.backendState.readRenderTargetArrayIndex)
1098 {
1099 vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1100 gsPa.rtArrayActive = true;
1101 }
1102
1103 {
1104 // OOB VPAI indices => forced to zero.
1105 vViewportIdx =
1106 SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1107 simd16scalari vNumViewports =
1108 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1109 simd16scalari vClearMask =
1110 SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1111 vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1112
1113 gsPa.useAlternateOffset = false;
1114 pfnClipFunc(pDC,
1115 gsPa,
1116 workerId,
1117 attrib_simd16,
1118 GenMask(gsPa.NumPrims()),
1119 vPrimId,
1120 vViewportIdx,
1121 vRtIdx);
1122 }
1123 #else
1124 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1125
1126 // Gather data from the SGV if provided.
1127 simdscalari vViewportIdx = SIMD::setzero_si();
1128 simdscalari vRtIdx = SIMD::setzero_si();
1129 SIMD::Vec4 svgAttrib[4];
1130
1131 if (state.backendState.readViewportArrayIndex ||
1132 state.backendState.readRenderTargetArrayIndex)
1133 {
1134 gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1135 }
1136
1137 if (state.backendState.readViewportArrayIndex)
1138 {
1139 vViewportIdx =
1140 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1141
1142 // OOB VPAI indices => forced to zero.
1143 vViewportIdx =
1144 SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
1145 simdscalari vNumViewports =
1146 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1147 simdscalari vClearMask =
1148 SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
1149 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
1150 gsPa.viewportArrayActive = true;
1151 }
1152 if (state.backendState.readRenderTargetArrayIndex)
1153 {
1154 vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1155 gsPa.rtArrayActive = true;
1156 }
1157
1158 pfnClipFunc(pDC,
1159 gsPa,
1160 workerId,
1161 attrib,
1162 GenMask(gsPa.NumPrims()),
1163 vPrimId,
1164 vViewportIdx,
1165 vRtIdx);
1166 #endif
1167 }
1168 }
1169 } while (gsPa.NextPrim());
1170 }
1171 }
1172 }
1173 }
1174
1175 // update GS pipeline stats
1176 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1177 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1178 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
1179 RDTSC_END(FEGeometryShader, 1);
1180 }
1181
1182 //////////////////////////////////////////////////////////////////////////
1183 /// @brief Allocate GS buffers
1184 /// @param pDC - pointer to draw context.
1185 /// @param state - API state
1186 /// @param ppGsOut - pointer to GS output buffer allocation
1187 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1188 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1189 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
1190 const API_STATE& state,
1191 uint32_t vertsPerPrim,
1192 GsBuffers* pGsBuffers)
1193 {
1194 auto pArena = pDC->pArena;
1195 SWR_ASSERT(pArena != nullptr);
1196 SWR_ASSERT(state.gsState.gsEnable);
1197
1198 const SWR_GS_STATE& gsState = state.gsState;
1199
1200 // Allocate storage for vertex inputs
1201 uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
1202 pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
1203
1204 // Allocate arena space to hold GS output verts
1205 const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
1206
1207 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1208 {
1209 pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
1210 }
1211
1212 // Allocate storage for transposed GS output
1213 uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
1214 uint32_t transposedBufferSize =
1215 numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
1216 pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
1217
1218 // Allocate storage to hold temporary stream->cut buffer, if necessary
1219 if (state.gsState.isSingleStream)
1220 {
1221 pGsBuffers->pStreamCutBuffer = nullptr;
1222 }
1223 else
1224 {
1225 pGsBuffers->pStreamCutBuffer =
1226 (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
1227 }
1228 }
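// Sizing sketch (editor's note, illustrative numbers): with
// inputVertStride == 32 slots and vertsPerPrim == 6 (tri list w/ adjacency),
// pGsIn spans 32 * sizeof(simdvector) * 6 bytes of assembled input, while
// each of the KNOB_SIMD_WIDTH pGsOut lanes gets its own
// instanceCount * allocationSize bytes, since every SIMD lane's GS
// invocation emits an independent output stream.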
1229
1230 //////////////////////////////////////////////////////////////////////////
1231 /// @brief Contains all data generated by the HS and passed to the
1232 /// tessellator and DS.
1233 struct TessellationThreadLocalData
1234 {
1235 SWR_HS_CONTEXT hsContext;
1236 ScalarPatch patchData[KNOB_SIMD_WIDTH];
1237 void* pTxCtx;
1238 size_t tsCtxSize;
1239
1240 simdscalar* pDSOutput;
1241 size_t dsOutputAllocSize;
1242 };
1243
1244 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1245
1246 //////////////////////////////////////////////////////////////////////////
1247 /// @brief Allocate tessellation data for this worker thread.
1248 INLINE
1249 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1250 {
1251 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1252 if (gt_pTessellationThreadData == nullptr)
1253 {
1254 gt_pTessellationThreadData =
1255 (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1256 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1257 }
1258 }
1259
1260 //////////////////////////////////////////////////////////////////////////
1261 /// @brief Implements Tessellation Stages.
1262 /// @param pDC - pointer to draw context.
1263 /// @param workerId - thread's worker id. Every thread has a unique id.
1264 /// @param pa - The primitive assembly object.
1265 /// @param pGsOut - output stream for GS
1266 template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
1267 static void TessellationStages(DRAW_CONTEXT* pDC,
1268 uint32_t workerId,
1269 PA_STATE& pa,
1270 GsBuffers* pGsBuffers,
1271 uint32_t* pSoPrimData,
1272 #if USE_SIMD16_FRONTEND
1273 uint32_t numPrims_simd8,
1274 #endif
1275 simdscalari const& primID)
1276 {
1277 const API_STATE& state = GetApiState(pDC);
1278 const SWR_TS_STATE& tsState = state.tsState;
1279 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1280
1281 SWR_ASSERT(gt_pTessellationThreadData);
1282
1283 HANDLE tsCtx = TSInitCtx(tsState.domain,
1284 tsState.partitioning,
1285 tsState.tsOutputTopology,
1286 gt_pTessellationThreadData->pTxCtx,
1287 gt_pTessellationThreadData->tsCtxSize);
1288 if (tsCtx == nullptr)
1289 {
1290 gt_pTessellationThreadData->pTxCtx =
1291 AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1292 tsCtx = TSInitCtx(tsState.domain,
1293 tsState.partitioning,
1294 tsState.tsOutputTopology,
1295 gt_pTessellationThreadData->pTxCtx,
1296 gt_pTessellationThreadData->tsCtxSize);
1297 }
1298 SWR_ASSERT(tsCtx);
1299
1300 #if USE_SIMD16_FRONTEND
1301 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1302 if (HasRastT::value)
1303 {
1304 switch (tsState.postDSTopology)
1305 {
1306 case TOP_TRIANGLE_LIST:
1307 pfnClipFunc = ClipTriangles_simd16;
1308 break;
1309 case TOP_LINE_LIST:
1310 pfnClipFunc = ClipLines_simd16;
1311 break;
1312 case TOP_POINT_LIST:
1313 pfnClipFunc = ClipPoints_simd16;
1314 break;
1315 default:
1316 SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1317 }
1318 }
1319
1320 #else
1321 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1322 if (HasRastT::value)
1323 {
1324 switch (tsState.postDSTopology)
1325 {
1326 case TOP_TRIANGLE_LIST:
1327 pfnClipFunc = ClipTriangles;
1328 break;
1329 case TOP_LINE_LIST:
1330 pfnClipFunc = ClipLines;
1331 break;
1332 case TOP_POINT_LIST:
1333 pfnClipFunc = ClipPoints;
1334 break;
1335 default:
1336 SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1337 }
1338 }
1339
1340 #endif
1341 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1342 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1343 hsContext.PrimitiveID = primID;
1344
1345 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1346 // Max storage for one attribute for an entire simdprimitive
1347 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1348
1349 // assemble all attributes for the input primitives
1350 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1351 {
1352 uint32_t attribSlot = tsState.vertexAttribOffset + slot;
1353 pa.Assemble(attribSlot, simdattrib);
1354
1355 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1356 {
1357 hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
1358 }
1359 }
1360
1361 #if defined(_DEBUG)
1362 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1363 #endif
1364
1365 #if USE_SIMD16_FRONTEND
1366 uint32_t numPrims = numPrims_simd8;
1367 #else
1368 uint32_t numPrims = pa.NumPrims();
1369 #endif
1370 hsContext.mask = GenerateMask(numPrims);
1371
1372 // Run the HS
1373 RDTSC_BEGIN(FEHullShader, pDC->drawId);
1374 state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
1375 RDTSC_END(FEHullShader, 0);
1376
1377 UPDATE_STAT_FE(HsInvocations, numPrims);
1378 AR_EVENT(HSStats((HANDLE)&hsContext.stats));
1379
1380 const uint32_t* pPrimId = (const uint32_t*)&primID;
1381
1382 for (uint32_t p = 0; p < numPrims; ++p)
1383 {
1384 // Run Tessellator
1385 SWR_TS_TESSELLATED_DATA tsData = {0};
1386 RDTSC_BEGIN(FETessellation, pDC->drawId);
1387 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1388 AR_EVENT(TessPrimCount(1));
1389 RDTSC_END(FETessellation, 0);
1390
1391 if (tsData.NumPrimitives == 0)
1392 {
1393 continue;
1394 }
1395 SWR_ASSERT(tsData.NumDomainPoints);
1396
1397 // Allocate DS Output memory
1398 uint32_t requiredDSVectorInvocations =
1399 AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1400 #if USE_SIMD16_FRONTEND
1401 size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
1402 tsState.dsAllocationSize; // simd8 -> simd16, padding
1403 #else
1404 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
1405 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1406 #endif
1407 if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
1408 {
1409 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1410 gt_pTessellationThreadData->pDSOutput =
1411 (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1412 gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
1413 }
1414 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1415 SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
1416
1417 #if defined(_DEBUG)
1418 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1419 #endif
1420
1421 // Run Domain Shader
1422 SWR_DS_CONTEXT dsContext;
1423 dsContext.PrimitiveID = pPrimId[p];
1424 dsContext.pCpIn = &hsContext.pCPout[p];
1425 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1426 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1427 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1428 dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
1429 #if USE_SIMD16_FRONTEND
1430 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
1431 #else
1432 dsContext.vectorStride = requiredDSVectorInvocations;
1433 #endif
1434
1435 uint32_t dsInvocations = 0;
1436
1437 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
1438 ++dsContext.vectorOffset)
1439 {
1440 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1441
1442 RDTSC_BEGIN(FEDomainShader, pDC->drawId);
1443 state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
1444 RDTSC_END(FEDomainShader, 0);
1445
1446 AR_EVENT(DSStats((HANDLE)&dsContext.stats));
1447
1448 dsInvocations += KNOB_SIMD_WIDTH;
1449 }
1450 UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1451
1452 #if USE_SIMD16_FRONTEND
1453 SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
1454
1455 #endif
1456 PA_TESS tessPa(
1457 pDC,
1458 #if USE_SIMD16_FRONTEND
1459 reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
1460 dsContext.vectorStride / 2, // simd8 -> simd16
1461 #else
1462 dsContext.pOutputData,
1463 dsContext.vectorStride,
1464 #endif
1465 SWR_VTX_NUM_SLOTS,
1466 tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
1467 tsData.ppIndices,
1468 tsData.NumPrimitives,
1469 tsState.postDSTopology,
1470 NumVertsPerPrim(tsState.postDSTopology, false));
1471
1472 while (tessPa.HasWork())
1473 {
1474 #if USE_SIMD16_FRONTEND
1475 const uint32_t numPrims = tessPa.NumPrims();
1476 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1477 const uint32_t numPrims_hi =
1478 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1479
1480 const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
1481 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1482 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1483
1484 #endif
1485 if (HasGeometryShaderT::value)
1486 {
1487 #if USE_SIMD16_FRONTEND
1488 tessPa.useAlternateOffset = false;
1489 GeometryShaderStage<HasStreamOutT, HasRastT>(
1490 pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1491
1492 if (numPrims_hi)
1493 {
1494 tessPa.useAlternateOffset = true;
1495 GeometryShaderStage<HasStreamOutT, HasRastT>(
1496 pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1497 }
1498 #else
1499 GeometryShaderStage<HasStreamOutT, HasRastT>(
1500 pDC,
1501 workerId,
1502 tessPa,
1503 pGsBuffers,
1504 pSoPrimData,
1505 _simd_set1_epi32(dsContext.PrimitiveID));
1506 #endif
1507 }
1508 else
1509 {
1510 if (HasStreamOutT::value)
1511 {
1512 #if ENABLE_AVX512_SIMD16
1513 tessPa.useAlternateOffset = false;
1514 #endif
1515 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1516 }
1517
1518 if (HasRastT::value)
1519 {
1520 #if USE_SIMD16_FRONTEND
1521 simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
1522 #else
1523 simdvector prim[3]; // Only deal with triangles, lines, or points
1524 #endif
1525 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
1526 bool assemble =
1527 #if USE_SIMD16_FRONTEND
1528 tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1529 #else
1530 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1531 #endif
1532 RDTSC_END(FEPAAssemble, 1);
1533 SWR_ASSERT(assemble);
1534
1535 SWR_ASSERT(pfnClipFunc);
1536 #if USE_SIMD16_FRONTEND
1537 // Gather data from the SGV if provided.
1538 simd16scalari vViewportIdx = SIMD16::setzero_si();
1539 simd16scalari vRtIdx = SIMD16::setzero_si();
1540 SIMD16::Vec4 svgAttrib[4];
1541
1542 if (state.backendState.readViewportArrayIndex ||
1543 state.backendState.readRenderTargetArrayIndex)
1544 {
1545 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1546 }
1547
1548 if (state.backendState.readViewportArrayIndex)
1549 {
1550 vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1551 tessPa.viewportArrayActive = true;
1552 }
1553 if (state.backendState.readRenderTargetArrayIndex)
1554 {
1555 vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1556 tessPa.rtArrayActive = true;
1557 }
1558
1559
1560 {
1561 // OOB VPAI indices => forced to zero.
1562 vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1563 simd16scalari vNumViewports =
1564 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1565 simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1566 vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1567
1568 tessPa.useAlternateOffset = false;
1569 pfnClipFunc(pDC,
1570 tessPa,
1571 workerId,
1572 prim_simd16,
1573 GenMask(numPrims),
1574 primID,
1575 vViewportIdx,
1576 vRtIdx);
1577 }
1578 #else
1579 // Gather data from the SGV if provided.
1580 simdscalari vViewportIdx = SIMD::setzero_si();
1581 simdscalari vRtIdx = SIMD::setzero_si();
1582 SIMD::Vec4 svgAttrib[4];
1583
1584 if (state.backendState.readViewportArrayIndex ||
1585 state.backendState.readRenderTargetArrayIndex)
1586 {
1587 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1588 }
1589
1590 if (state.backendState.readViewportArrayIndex)
1591 {
1592 vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1593
1594 // OOB VPAI indices => forced to zero.
1595 vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
1596 simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1597 simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
1598 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
1599 tessPa.viewportArrayActive = true;
1600 }
1601 if (state.backendState.readRenderTargetArrayIndex)
1602 {
1603 vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1604 tessPa.rtArrayActive = true;
1605 }
1606 pfnClipFunc(pDC,
1607 tessPa,
1608 workerId,
1609 prim,
1610 GenMask(tessPa.NumPrims()),
1611 _simd_set1_epi32(dsContext.PrimitiveID),
1612 vViewportIdx,
1613 vRtIdx);
1614 #endif
1615 }
1616 }
1617
1618 tessPa.NextPrim();
1619
1620 } // while (tessPa.HasWork())
1621 } // for (uint32_t p = 0; p < numPrims; ++p)
1622
1623 #if USE_SIMD16_FRONTEND
1624 if (gt_pTessellationThreadData->pDSOutput != nullptr)
1625 {
1626 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1627 gt_pTessellationThreadData->pDSOutput = nullptr;
1628 }
1629 gt_pTessellationThreadData->dsOutputAllocSize = 0;
1630
1631 #endif
1632 TSDestroyCtx(tsCtx);
1633 }
1634
1635 THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
1636 THREAD uint32_t gVertexStoreSize = 0;
1637
1638 //////////////////////////////////////////////////////////////////////////
1639 /// @brief FE handler for SwrDraw.
1640 /// @tparam IsIndexedT - Is indexed drawing enabled
1641 /// @tparam HasTessellationT - Is tessellation enabled
1642 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1643 /// @tparam HasStreamOutT - Is stream-out enabled
1644 /// @tparam HasRastT - Is rasterization enabled
1645 /// @param pContext - pointer to SWR context.
1646 /// @param pDC - pointer to draw context.
1647 /// @param workerId - thread's worker id.
1648 /// @param pUserData - Pointer to DRAW_WORK
1649 template <typename IsIndexedT,
1650 typename IsCutIndexEnabledT,
1651 typename HasTessellationT,
1652 typename HasGeometryShaderT,
1653 typename HasStreamOutT,
1654 typename HasRastT>
1655 void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
1656 {
1657 #if KNOB_ENABLE_TOSS_POINTS
1658 if (KNOB_TOSS_QUEUE_FE)
1659 {
1660 return;
1661 }
1662 #endif
1663
1664 RDTSC_BEGIN(FEProcessDraw, pDC->drawId);
1665
1666 void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1667
1668 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1669 const API_STATE& state = GetApiState(pDC);
1670
1671 uint32_t indexSize = 0;
1672 uint32_t endVertex = work.numVerts;
1673
1674 gfxptr_t xpLastRequestedIndex = 0;
1675 if (IsIndexedT::value)
1676 {
1677 switch (work.type)
1678 {
1679 case R32_UINT:
1680 indexSize = sizeof(uint32_t);
1681 break;
1682 case R16_UINT:
1683 indexSize = sizeof(uint16_t);
1684 break;
1685 case R8_UINT:
1686 indexSize = sizeof(uint8_t);
1687 break;
1688 default:
1689 SWR_INVALID("Invalid work.type: %d", work.type);
1690 }
1691 xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
1692 }
1693 else
1694 {
1695 // No cuts, prune partial primitives.
1696 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1697 }
1698
1699 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1700 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1701 #endif
1702
1703 GsBuffers gsBuffers;
1704 if (HasGeometryShaderT::value)
1705 {
1706 #if USE_SIMD16_FRONTEND
1707 AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
1708 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1709 #else
1710 AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
1711 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1712 #endif
1713 }
1714
1715 if (HasTessellationT::value)
1716 {
1717 SWR_ASSERT(state.tsState.tsEnable == true);
1718 SWR_ASSERT(state.pfnHsFunc != nullptr);
1719 SWR_ASSERT(state.pfnDsFunc != nullptr);
1720
1721 AllocateTessellationData(pContext);
1722 }
1723 else
1724 {
1725 SWR_ASSERT(state.tsState.tsEnable == false);
1726 SWR_ASSERT(state.pfnHsFunc == nullptr);
1727 SWR_ASSERT(state.pfnDsFunc == nullptr);
1728 }
1729
1730 // allocate space for streamout input prim data
1731 uint32_t* pSoPrimData = nullptr;
1732 if (HasStreamOutT::value)
1733 {
1734 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1735 }
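    // 4KB of per-draw arena scratch, presumably sized for the worst case; the
    // StreamOut() calls below use it to stage per-primitive vertex data before
    // writing to the bound SO buffers.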
1736
1737 const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1738 #if USE_SIMD16_FRONTEND
1739 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1740 #else
1741 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1742 #endif
1743
1744 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1745
1746 // Compute storage requirements for vertex store
1747 // TODO: allocation needs to be rethought for better cut support
1748 uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
1749 uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1750
1751 // grow the vertex store for the PA as necessary
1752 if (gVertexStoreSize < vertexStoreSize)
1753 {
1754 if (gpVertexStore != nullptr)
1755 {
1756 AlignedFree(gpVertexStore);
1757 gpVertexStore = nullptr;
1758 }
1759
1760 SWR_ASSERT(gpVertexStore == nullptr);
1761
1762 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
1763 gVertexStoreSize = vertexStoreSize;
1764
1765 SWR_ASSERT(gpVertexStore != nullptr);
1766 }
1767
1768 // choose primitive assembler
1769
1770 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
1771 state.topology,
1772 work.numVerts,
1773 gpVertexStore,
1774 numVerts,
1775 state.frontendState.vsVertexSize,
1776 GetNumVerts(state.topology, 1));
1777 PA_STATE& pa = paFactory.GetPA();
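    // PA_FACTORY picks a primitive-assembler implementation from the topology and
    // the IsIndexedT/IsCutIndexEnabledT traits (e.g. a cut-aware assembler when
    // primitive restart is enabled); GetPA returns the chosen state machine by
    // reference, backed by the thread-local vertex store above.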
1778
1779 #if USE_SIMD16_FRONTEND
1780 #if USE_SIMD16_SHADERS
1781 simd16vertex vin;
1782 #else
1783 simdvertex vin_lo;
1784 simdvertex vin_hi;
1785 #endif
1786 SWR_VS_CONTEXT vsContext_lo;
1787 SWR_VS_CONTEXT vsContext_hi;
1788
1789 #if USE_SIMD16_SHADERS
1790 vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
1791 vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
1792 #else
1793 vsContext_lo.pVin = &vin_lo;
1794 vsContext_hi.pVin = &vin_hi;
1795 #endif
1796 vsContext_lo.AlternateOffset = 0;
1797 vsContext_hi.AlternateOffset = 1;
1798
1799 SWR_FETCH_CONTEXT fetchInfo_lo = {0};
1800
1801 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1802 fetchInfo_lo.StartInstance = work.startInstance;
1803 fetchInfo_lo.StartVertex = 0;
1804
1805 if (IsIndexedT::value)
1806 {
1807 fetchInfo_lo.BaseVertex = work.baseVertex;
1808
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
1811 fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1812 if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1813 {
1814 fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1815 }
1816 }
1817 else
1818 {
1819 fetchInfo_lo.StartVertex = work.startVertex;
1820 }
1821
1822 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1823
1824 const simd16scalari vScale =
1825 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
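    // vScale holds the lane indices 0..15 (set_epi32 arguments run high lane to
    // low); adding it to a broadcast of startVertexID below synthesizes sequential
    // vertex IDs for non-indexed draws.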
1826
1827 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1828 {
1829 uint32_t i = 0;
1830
1831 simd16scalari vIndex;
1832
1833 if (IsIndexedT::value)
1834 {
1835 fetchInfo_lo.xpIndices = work.xpIB;
1836 fetchInfo_hi.xpIndices =
1837 fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1838 }
1839 else
1840 {
1841 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1842
1843 fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
            // The half-width (1/2 of KNOB_SIMD16_WIDTH) offset must be applied in
            // bytes; stepping &vIndex as a simd16scalari* would advance by whole
            // vectors and point far past vIndex.
            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
                GetPrivateState(pDC),
                reinterpret_cast<uint8_t*>(&vIndex) + KNOB_SIMD_WIDTH * sizeof(int32_t));
1847 }
1848
1849 fetchInfo_lo.CurInstance = instanceNum;
1850 fetchInfo_hi.CurInstance = instanceNum;
1851
1852 vsContext_lo.InstanceID = instanceNum;
1853 vsContext_hi.InstanceID = instanceNum;
1854
1855 while (pa.HasWork())
1856 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
1859
1860 simdmask* pvCutIndices_lo = nullptr;
1861 simdmask* pvCutIndices_hi = nullptr;
1862
1863 if (IsIndexedT::value)
1864 {
1865 // simd16mask <=> simdmask[2]
1866
1867 pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
1868 pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
1869 }
1870
1871 simd16vertex& vout = pa.GetNextVsOutput();
1872
1873 vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
1874 vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
1875
1876 if (i < endVertex)
1877 {
1878 if (!IsIndexedT::value)
1879 {
1880 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                    uint32_t offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                    offset *= sizeof(int32_t); // convert from index count to byte offset
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.xpLastIndex += offset;
#else
                    // Clamp in byte units (offset is already scaled by sizeof(int32_t));
                    // the paired min() also keeps offset2 from underflowing when fewer
                    // than KNOB_SIMD_WIDTH vertices remain.
                    const uint32_t loBytes =
                        uint32_t(KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
                    fetchInfo_lo.xpLastIndex += std::min(offset, loBytes);
                    uint32_t offset2 = std::min(offset, 2 * loBytes) - std::min(offset, loBytes);
                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
                    fetchInfo_hi.xpLastIndex += offset2;
#endif
1893 #endif
1894 }
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD16.
1896 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
1897 #if USE_SIMD16_SHADERS
1898 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
1899 #else
1900 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
1901
1902 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1903 {
1904 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
1905 }
1906 #endif
1907 RDTSC_END(FEFetchShader, 0);
1908
1909 // forward fetch generated vertex IDs to the vertex shader
1910 #if USE_SIMD16_SHADERS
1911 #if USE_SIMD16_VS
1912 vsContext_lo.VertexID16 =
1913 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1914 vsContext_lo.VertexID16 =
1915 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1916 #else
1917 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1918 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1919 #endif
1920 #else
1921 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1922 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1923 #endif
1924
1925 // Setup active mask for vertex shader.
1926 #if USE_SIMD16_VS
1927 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1928 #else
1929 vsContext_lo.mask = GenerateMask(endVertex - i);
1930 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1931 #endif
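                // GenerateMask/GenerateMask16 enable the low min(n, SIMD width) lanes, so
                // the final, partially filled SIMD runs with its trailing lanes masked off
                // (e.g. 3 vertices remaining on an 8-wide SIMD gives mask 0b00000111).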
1932
1933 // forward cut mask to the PA
1934 if (IsIndexedT::value)
1935 {
1936 #if USE_SIMD16_SHADERS
1937 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1938 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1939 #else
1940 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1941 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1942 #endif
1943 }
1944
1945 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1946
1947 #if KNOB_ENABLE_TOSS_POINTS
1948 if (!KNOB_TOSS_FETCH)
1949 #endif
1950 {
1951 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
1952 #if USE_SIMD16_VS
1953 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1954 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1955 #else
1956 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1957 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1958
1959 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1960 {
1961 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
1962 AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
1963 }
1964 #endif
1965 RDTSC_END(FEVertexShader, 0);
1966
1967 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1968 }
1969 }
1970
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
1972 do
1973 {
1974 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1975
                RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_END(FEPAAssemble, 1);
1979
1980 #if KNOB_ENABLE_TOSS_POINTS
1981 if (!KNOB_TOSS_FETCH)
1982 #endif
1983 {
1984 #if KNOB_ENABLE_TOSS_POINTS
1985 if (!KNOB_TOSS_VS)
1986 #endif
1987 {
1988 if (assemble)
1989 {
1990 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1991
1992 const uint32_t numPrims = pa.NumPrims();
1993 const uint32_t numPrims_lo =
1994 std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1995 const uint32_t numPrims_hi =
1996 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1997
1998 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1999 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
2000 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
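                            // Split the SIMD16 primitive set into two SIMD8 halves for the
                            // 8-wide downstream stages: e.g. numPrims == 11 gives
                            // numPrims_lo == 8 and numPrims_hi == 3. The max() form keeps
                            // the subtraction from underflowing when numPrims < KNOB_SIMD_WIDTH.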
2001
2002 if (HasTessellationT::value)
2003 {
2004 pa.useAlternateOffset = false;
2005 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2006 pDC,
2007 workerId,
2008 pa,
2009 &gsBuffers,
2010 pSoPrimData,
2011 numPrims_lo,
2012 primID_lo);
2013
2014 if (numPrims_hi)
2015 {
2016 pa.useAlternateOffset = true;
2017 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2018 pDC,
2019 workerId,
2020 pa,
2021 &gsBuffers,
2022 pSoPrimData,
2023 numPrims_hi,
2024 primID_hi);
2025 }
2026 }
2027 else if (HasGeometryShaderT::value)
2028 {
2029 pa.useAlternateOffset = false;
2030 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2031 workerId,
2032 pa,
2033 &gsBuffers,
2034 pSoPrimData,
2035 numPrims_lo,
2036 primID_lo);
2037
2038 if (numPrims_hi)
2039 {
2040 pa.useAlternateOffset = true;
2041 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2042 workerId,
2043 pa,
2044 &gsBuffers,
2045 pSoPrimData,
2046 numPrims_hi,
2047 primID_hi);
2048 }
2049 }
2050 else
2051 {
2052 // If streamout is enabled then stream vertices out to memory.
2053 if (HasStreamOutT::value)
2054 {
2055 pa.useAlternateOffset = false;
2056 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2057 }
2058
2059 if (HasRastT::value)
2060 {
2061 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                            // Gather system-generated values (SGV) if provided.
2063 simd16scalari vpai = SIMD16::setzero_si();
2064 simd16scalari rtai = SIMD16::setzero_si();
2065 SIMD16::Vec4 svgAttrib[4];
2066
2067 if (state.backendState.readViewportArrayIndex ||
2068 state.backendState.readRenderTargetArrayIndex)
2069 {
2070 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2071 }
2072
2073 if (state.backendState.readViewportArrayIndex)
2074 {
2075 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2076 pa.viewportArrayActive = true;
2077 }
2078 if (state.backendState.readRenderTargetArrayIndex)
2079 {
2080 rtai =
2081 SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2082 pa.rtArrayActive = true;
2083 }
2084
2085 {
2086 // OOB VPAI indices => forced to zero.
2087 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
2088 simd16scalari vNumViewports =
2089 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2090 simd16scalari vClearMask =
2091 SIMD16::cmplt_epi32(vpai, vNumViewports);
2092 vpai = SIMD16::and_si(vClearMask, vpai);
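                                // e.g. a lane with vpai == -2 clamps to 0 via max_epi32,
                                // and a lane with vpai >= KNOB_NUM_VIEWPORTS_SCISSORS fails
                                // cmplt and is and-ed down to 0.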
2093
2094 pa.useAlternateOffset = false;
2095 pDC->pState->pfnProcessPrims_simd16(pDC,
2096 pa,
2097 workerId,
2098 prim_simd16,
2099 GenMask(numPrims),
2100 primID,
2101 vpai,
2102 rtai);
2103 }
2104 }
2105 }
2106 }
2107 }
2108 }
2109 } while (pa.NextPrim());
2110
2111 if (IsIndexedT::value)
2112 {
2113 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2114 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2115 }
2116 else
2117 {
2118 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
2119 }
2120
2121 i += KNOB_SIMD16_WIDTH;
2122 }
2123
2124 pa.Reset();
2125 }
2126
2127 #else
2128 SWR_VS_CONTEXT vsContext;
2129 SWR_FETCH_CONTEXT fetchInfo = {0};
2130
2131 fetchInfo.pStreams = &state.vertexBuffers[0];
2132 fetchInfo.StartInstance = work.startInstance;
2133 fetchInfo.StartVertex = 0;
2134
2135 if (IsIndexedT::value)
2136 {
2137 fetchInfo.BaseVertex = work.baseVertex;
2138
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        // xpLastRequestedIndex is a gfxptr_t, so compare and assign through explicit casts.
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
2147 }
2148 else
2149 {
2150 fetchInfo.StartVertex = work.startVertex;
2151 }
2152
2153 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2154
2155 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2156 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
2157 {
2158 simdscalari vIndex;
2159 uint32_t i = 0;
2160
2161 if (IsIndexedT::value)
2162 {
2163 fetchInfo.pIndices = work.pIB;
2164 }
2165 else
2166 {
2167 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
2168 fetchInfo.pIndices = (const int32_t*)&vIndex;
2169 }
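        // For non-indexed draws the fetch shader still walks an "index" stream, so
        // vIndex (startVertexID plus the lane offsets 0..7 in vScale) serves as an
        // in-memory pseudo index buffer via the pIndices pointer.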
2170
2171 fetchInfo.CurInstance = instanceNum;
2172 vsContext.InstanceID = instanceNum;
2173
2174 while (pa.HasWork())
2175 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
2178 simdmask* pvCutIndices = nullptr;
2179 if (IsIndexedT::value)
2180 {
2181 pvCutIndices = &pa.GetNextVsIndices();
2182 }
2183
2184 simdvertex& vout = pa.GetNextVsOutput();
2185 vsContext.pVin = &vout;
2186 vsContext.pVout = &vout;
2187
2188 if (i < endVertex)
2189 {
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
2191 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
2192 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
2193 RDTSC_END(FEFetchShader, 0);
2194
2195 // forward fetch generated vertex IDs to the vertex shader
2196 vsContext.VertexID = fetchInfo.VertexID;
2197
2198 // Setup active mask for vertex shader.
2199 vsContext.mask = GenerateMask(endVertex - i);
2200
2201 // forward cut mask to the PA
2202 if (IsIndexedT::value)
2203 {
2204 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2205 }
2206
2207 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2208
2209 #if KNOB_ENABLE_TOSS_POINTS
2210 if (!KNOB_TOSS_FETCH)
2211 #endif
2212 {
2213 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
2214 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
2215 RDTSC_END(FEVertexShader, 0);
2216
2217 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2218 AR_EVENT(VSStats((HANDLE)&vsContext.stats));
2219 }
2220 }
2221
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
2223 do
2224 {
2225 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // pa.Assemble returns false if there are not enough vertices to assemble.
2227 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
2228 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2229 RDTSC_END(FEPAAssemble, 1);
2230
2231 #if KNOB_ENABLE_TOSS_POINTS
2232 if (!KNOB_TOSS_FETCH)
2233 #endif
2234 {
2235 #if KNOB_ENABLE_TOSS_POINTS
2236 if (!KNOB_TOSS_VS)
2237 #endif
2238 {
2239 if (assemble)
2240 {
2241 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2242
2243 if (HasTessellationT::value)
2244 {
2245 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2246 pDC,
2247 workerId,
2248 pa,
2249 &gsBuffers,
2250 pSoPrimData,
2251 pa.GetPrimID(work.startPrimID));
2252 }
2253 else if (HasGeometryShaderT::value)
2254 {
2255 GeometryShaderStage<HasStreamOutT, HasRastT>(
2256 pDC,
2257 workerId,
2258 pa,
2259 &gsBuffers,
2260 pSoPrimData,
2261 pa.GetPrimID(work.startPrimID));
2262 }
2263 else
2264 {
2265 // If streamout is enabled then stream vertices out to memory.
2266 if (HasStreamOutT::value)
2267 {
2268 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2269 }
2270
2271 if (HasRastT::value)
2272 {
2273 SWR_ASSERT(pDC->pState->pfnProcessPrims);
2274
                            // Gather system-generated values (SGV) if provided.
2276 simdscalari vViewportIdx = SIMD::setzero_si();
2277 simdscalari vRtIdx = SIMD::setzero_si();
2278 SIMD::Vec4 svgAttrib[4];
2279
2280 if (state.backendState.readViewportArrayIndex ||
2281 state.backendState.readRenderTargetArrayIndex)
2282 {
2283 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2284 }
2285
2286 if (state.backendState.readViewportArrayIndex)
2287 {
2288 vViewportIdx =
2289 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2290
2291 // OOB VPAI indices => forced to zero.
2292 vViewportIdx =
2293 SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2294 simdscalari vNumViewports =
2295 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2296 simdscalari vClearMask =
2297 SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2298 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2299 pa.viewportArrayActive = true;
2300 }
2301 if (state.backendState.readRenderTargetArrayIndex)
2302 {
2303 vRtIdx =
2304 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2305 pa.rtArrayActive = true;
2306 }
2307
2308 pDC->pState->pfnProcessPrims(pDC,
2309 pa,
2310 workerId,
2311 prim,
2312 GenMask(pa.NumPrims()),
2313 pa.GetPrimID(work.startPrimID),
2314 vViewportIdx,
2315 vRtIdx);
2316 }
2317 }
2318 }
2319 }
2320 }
2321 } while (pa.NextPrim());
2322
2323 if (IsIndexedT::value)
2324 {
2325 fetchInfo.pIndices =
2326 (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2327 }
2328 else
2329 {
2330 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2331 }
2332
2333 i += KNOB_SIMD_WIDTH;
2334 }
2335 pa.Reset();
2336 }
2337
2338 #endif
2339
2340 RDTSC_END(FEProcessDraw, numPrims * work.numInstances);
2341 }
2342
2343 struct FEDrawChooser
2344 {
2345 typedef PFN_FE_WORK_FUNC FuncType;
2346
2347 template <typename... ArgsB>
2348 static FuncType GetFunc()
2349 {
2350 return ProcessDraw<ArgsB...>;
2351 }
2352 };
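// TemplateArgUnroller expands the runtime bools passed to GetProcessDrawFunc into
// compile-time boolean types (effectively std::true_type / std::false_type),
// instantiating the one ProcessDraw specialization that matches the draw state.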
2353
2354 // Selector for correct templated Draw front-end function
2355 PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
2356 bool IsCutIndexEnabled,
2357 bool HasTessellation,
2358 bool HasGeometryShader,
2359 bool HasStreamOut,
2360 bool HasRasterization)
2361 {
2362 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
2363 IsCutIndexEnabled,
2364 HasTessellation,
2365 HasGeometryShader,
2366 HasStreamOut,
2367 HasRasterization);
2368 }
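
// Illustrative use only (a sketch, not part of the driver: pContext, pDC,
// worker, and drawWork stand in for the caller's state and are not defined
// here). Select and invoke the FE handler for an indexed draw with
// rasterization and no optional stages enabled:
//
//     PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(true,   // IsIndexed
//                                                 false,  // IsCutIndexEnabled
//                                                 false,  // HasTessellation
//                                                 false,  // HasGeometryShader
//                                                 false,  // HasStreamOut
//                                                 true);  // HasRasterization
//     pfnFE(pContext, pDC, worker, &drawWork);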