swr: [rasterizer core] SIMD16 Frontend WIP
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Even thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
58 void ProcessSync(
59 SWR_CONTEXT *pContext,
60 DRAW_CONTEXT *pDC,
61 uint32_t workerId,
62 void *pUserData)
63 {
64 BE_WORK work;
65 work.type = SYNC;
66 work.pfnWork = ProcessSyncBE;
67
68 MacroTileMgr *pTileMgr = pDC->pTileMgr;
69 pTileMgr->enqueue(0, 0, &work);
70 }
71
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Even thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
78 void ProcessShutdown(
79 SWR_CONTEXT *pContext,
80 DRAW_CONTEXT *pDC,
81 uint32_t workerId,
82 void *pUserData)
83 {
84 BE_WORK work;
85 work.type = SHUTDOWN;
86 work.pfnWork = ProcessShutdownBE;
87
88 MacroTileMgr *pTileMgr = pDC->pTileMgr;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94 {
95 for (uint32_t n = 0; n < numNumaNodes; ++n)
96 {
97 pTileMgr->enqueue(i, n, &work);
98 }
99 }
100 }
101
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Even thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
109 void ProcessClear(
110 SWR_CONTEXT *pContext,
111 DRAW_CONTEXT *pDC,
112 uint32_t workerId,
113 void *pUserData)
114 {
115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116 MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124
125 BE_WORK work;
126 work.type = CLEAR;
127 work.pfnWork = ProcessClearBE;
128 work.desc.clear = *pDesc;
129
130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131 {
132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133 {
134 pTileMgr->enqueue(x, y, &work);
135 }
136 }
137 }
138
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Even thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT *pContext,
148 DRAW_CONTEXT *pDC,
149 uint32_t workerId,
150 void *pUserData)
151 {
152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153 MacroTileMgr *pTileMgr = pDC->pTileMgr;
154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163 // store tiles
164 BE_WORK work;
165 work.type = STORETILES;
166 work.pfnWork = ProcessStoreTilesBE;
167 work.desc.storeTiles = *pDesc;
168
169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170 {
171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172 {
173 pTileMgr->enqueue(x, y, &work);
174 }
175 }
176
177 AR_END(FEProcessStoreTiles, 0);
178 }
179
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default: only tiles fully covered by the rect (min rounded up, max rounded down).
    // NOTE(review): if the rect spans less than one full macrotile,
    // (xmax / DIM) - 1 underflows the unsigned max here — presumably callers
    // guarantee at least one full tile when fullTilesOnly is set; confirm.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles: round min down, and use (exclusive max - 1)
        // so any partially covered edge tile is included
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    // NOTE(review): these asserts/clamps use <= / min against the tile COUNT,
    // not (count - 1); looks like an off-by-one if hot-tile indices are
    // zero-based — confirm against KNOB_NUM_HOT_TILES_* semantics.
    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
234
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @return primitive count; 0 for unsupported topologies or too few verts.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    // list topologies: fixed number of verts per prim
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    // strip/fan topologies: first prim consumes a full set of verts,
    // each additional vert (or vert pair for quad strips) adds one prim
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;           // loop: one segment per vert (closing segment included)
    case TOP_RECT_LIST: return numPrims / 3;
    // adjacency topologies: adjacent verts are part of each prim's vert budget
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // patch lists: N control points per patch, N encoded in the enum value
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // control-point count is the offset from the patchlist base enum
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
311
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
///        Inverse of GetNumPrims for each topology.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
/// @return vertex count; 0 for unsupported topologies or zero prims.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    // list topologies: fixed number of verts per prim
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    // strip/fan topologies: full vert set for the first prim, one
    // additional vert (or pair for quad strips) for each prim after
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;           // loop: one vert per segment
    case TOP_RECT_LIST: return numPrims * 3;
    // adjacency topologies: adjacent verts counted per prim
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // patch lists: N control points per patch, N encoded in the enum value
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // control-point count is the offset from the patchlist base enum
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
387
//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
/// @return verts per prim (e.g. 3 for triangles); 0 for unknown topologies.
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // control-point count is the offset from the patchlist base enum
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    // adjacency topologies carry extra neighbor verts per primitive;
    // override the base counts when the caller wants them included
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numWorkItems - Number of items being worked on by a SIMD.
480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484 return _simd_castps_si(vMask(mask));
485 }
486
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object used to fetch prim attributes.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer attributes are staged into before
///        the streamout jit function consumes them.
/// @param numPrims_simd8 - (SIMD16 builds only) number of prims in the
///        simd8 half being streamed out.
/// @param streamIndex - streamout stream to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // adjacency verts are not streamed out
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // soMask has one bit per attribute slot enabled for this stream
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): the dword offset is computed from 'slot' directly,
            // even though attributes are read from slot + VERTEX_ATTRIB_START_SLOT
            // in the PA buffer — presumably the SO jit expects the mask-relative
            // layout; a prior comment here described a (slot - 1) adjustment
            // that the code does not perform. Confirm against the SO jit.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // clear the bit just processed and move to the next enabled slot
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
585
586 #if USE_SIMD16_FRONTEND
587 //////////////////////////////////////////////////////////////////////////
588 /// Is value an even number (a multiple of two)
589 ///
590 template <typename T>
591 INLINE static bool IsEven(T value)
592 {
593 return (value & 1) == 0;
594 }
595
596 //////////////////////////////////////////////////////////////////////////
597 /// Round up value to an even number (a multiple of two)
598 ///
599 template <typename T>
600 INLINE static T RoundUpEven(T value)
601 {
602 return (value + 1) & ~1;
603 }
604
605 //////////////////////////////////////////////////////////////////////////
606 /// Round down value to an even number (a multiple of two)
607 ///
608 template <typename T>
609 INLINE static T RoundDownEven(T value)
610 {
611 return value & ~1;
612 }
613
//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
/// NOTE(review): despite the "must be even" above, the guard below also
/// tolerates an odd trailing vertex (upper lanes are left zeroed) — confirm
/// which contract callers rely on.
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determinded by KNOB_NUM_ATTRIBUTES
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= KNOB_NUM_ATTRIBUTES);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                // lower half comes from simdvertex i; start from zero so the
                // upper half stays cleared if there is no partner vertex
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    // upper half comes from simdvertex i + 1
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        // commit one packed simd16vertex per input pair
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}
652
653 #endif
654 //////////////////////////////////////////////////////////////////////////
655 /// @brief Computes number of invocations. The current index represents
656 /// the start of the SIMD. The max index represents how much work
657 /// items are remaining. If there is less then a SIMD's xmin of work
658 /// then return the remaining amount of work.
659 /// @param curIndex - The start index for the SIMD.
660 /// @param maxIndex - The last index for all work items.
661 static INLINE uint32_t GetNumInvocations(
662 uint32_t curIndex,
663 uint32_t maxIndex)
664 {
665 uint32_t remainder = (maxIndex - curIndex);
666 #if USE_SIMD16_FRONTEND
667 return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
668 #else
669 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
670 #endif
671 }
672
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
675 /// The geometry shader will loop over each active streamout buffer, assembling
676 /// primitives for the downstream stages. When multistream output is enabled,
677 /// the generated stream ID buffer from the GS needs to be converted to a cut
678 /// buffer for the primitive assembler.
679 /// @param stream - stream id to generate the cut buffer for
680 /// @param pStreamIdBase - pointer to the stream ID buffer
681 /// @param numEmittedVerts - Number of total verts emitted by the GS
682 /// @param pCutBuffer - output buffer to write cuts to
683 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
684 {
685 SWR_ASSERT(stream < MAX_SO_STREAMS);
686
687 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
688 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
689
690 for (uint32_t b = 0; b < numOutputBytes; ++b)
691 {
692 uint8_t curInputByte = pStreamIdBase[2*b];
693 uint8_t outByte = 0;
694 for (uint32_t i = 0; i < 4; ++i)
695 {
696 if ((curInputByte & 0x3) != stream)
697 {
698 outByte |= (1 << i);
699 }
700 curInputByte >>= 2;
701 }
702
703 curInputByte = pStreamIdBase[2 * b + 1];
704 for (uint32_t i = 0; i < 4; ++i)
705 {
706 if ((curInputByte & 0x3) != stream)
707 {
708 outByte |= (1 << (i + 4));
709 }
710 curInputByte >>= 2;
711 }
712
713 *pCutBuffer++ = outByte;
714 }
715 }
716
// GS execution context, one per worker (THREAD presumably marks
// thread-local storage — confirm against the macro's definition).
THREAD SWR_GS_CONTEXT tlsGsContext;

#if USE_SIMD16_FRONTEND
// Per-thread scratch used to repack GS simdvertex output into simd16vertex
// form before it is handed to the SIMD16 primitive assembler.
THREAD simd16vertex tempVertex_simd16[128];

#endif
// Computes the strides the GS stage uses to index its per-primitive and
// per-instance output, cut, and stream-cut buffers. Parameterized on the
// SIMD vertex type/width so the same math serves simd8 and simd16 builds.
template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
struct GsBufferInfo
{
    GsBufferInfo(const SWR_GS_STATE &gsState)
    {
        const uint32_t vertexCount = gsState.maxNumVerts;
        const uint32_t vertexStride = sizeof(SIMDVERTEX);
        // round the max vertex count up to whole SIMD batches
        const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;

        vertexPrimitiveStride = vertexStride * numSimdBatches;
        vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;

        if (gsState.isSingleStream)
        {
            // single stream: 1 cut bit per vertex, packed into bytes
            cutPrimitiveStride = (vertexCount + 7) / 8;
            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;

            // no separate per-stream cut scratch needed
            streamCutPrimitiveStride = 0;
            streamCutInstanceStride = 0;
        }
        else
        {
            // multistream: 2 stream-id bits per vertex, rounded to 4 bytes
            cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;

            // plus a 1-bit-per-vertex cut buffer, filled per stream when the
            // stream-id buffer is translated (see ProcessStreamIdBuffer)
            streamCutPrimitiveStride = (vertexCount + 7) / 8;
            streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
        }
    }

    uint32_t vertexPrimitiveStride;     // bytes of vertex data per input primitive
    uint32_t vertexInstanceStride;      // bytes of vertex data per GS instance

    uint32_t cutPrimitiveStride;        // bytes of cut/stream-id data per input primitive
    uint32_t cutInstanceStride;         // bytes of cut/stream-id data per GS instance

    uint32_t streamCutPrimitiveStride;  // bytes of per-stream cut scratch per primitive
    uint32_t streamCutInstanceStride;   // bytes of per-stream cut scratch per instance
};
762
763 //////////////////////////////////////////////////////////////////////////
764 /// @brief Implements GS stage.
765 /// @param pDC - pointer to draw context.
766 /// @param workerId - thread's worker id. Even thread has a unique id.
767 /// @param pa - The primitive assembly object.
768 /// @param pGsOut - output stream for GS
769 template <
770 typename HasStreamOutT,
771 typename HasRastT>
772 static void GeometryShaderStage(
773 DRAW_CONTEXT *pDC,
774 uint32_t workerId,
775 PA_STATE& pa,
776 void* pGsOut,
777 void* pCutBuffer,
778 void* pStreamCutBuffer,
779 uint32_t* pSoPrimData,
780 #if USE_SIMD16_FRONTEND
781 uint32_t numPrims_simd8,
782 #endif
783 simdscalari primID)
784 {
785 SWR_CONTEXT *pContext = pDC->pContext;
786
787 AR_BEGIN(FEGeometryShader, pDC->drawId);
788
789 const API_STATE& state = GetApiState(pDC);
790 const SWR_GS_STATE* pState = &state.gsState;
791
792 SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
793 SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
794
795 tlsGsContext.pStream = (uint8_t*)pGsOut;
796 tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
797 tlsGsContext.PrimitiveID = primID;
798
799 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
800 simdvector attrib[MAX_ATTRIBUTES];
801
802 // assemble all attributes for the input primitive
803 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
804 {
805 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
806 pa.Assemble(attribSlot, attrib);
807
808 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
809 {
810 tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
811 }
812 }
813
814 // assemble position
815 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
816 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
817 {
818 tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
819 }
820
821 const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
822
823 // record valid prims from the frontend to avoid over binning the newly generated
824 // prims from the GS
825 #if USE_SIMD16_FRONTEND
826 uint32_t numInputPrims = numPrims_simd8;
827 #else
828 uint32_t numInputPrims = pa.NumPrims();
829 #endif
830
831 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
832 {
833 tlsGsContext.InstanceID = instance;
834 tlsGsContext.mask = GenerateMask(numInputPrims);
835
836 // execute the geometry shader
837 state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
838
839 tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
840 tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
841 }
842
843 // set up new binner and state for the GS output topology
844 #if USE_SIMD16_FRONTEND
845 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
846 if (HasRastT::value)
847 {
848 switch (pState->outputTopology)
849 {
850 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
851 case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
852 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
853 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
854 }
855 }
856
857 #else
858 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
859 if (HasRastT::value)
860 {
861 switch (pState->outputTopology)
862 {
863 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
864 case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
865 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
866 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
867 }
868 }
869
870 #endif
871 // foreach input prim:
872 // - setup a new PA based on the emitted verts for that prim
873 // - loop over the new verts, calling PA to assemble each prim
874 uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
875 uint32_t* pPrimitiveId = (uint32_t*)&primID;
876
877 uint32_t totalPrimsGenerated = 0;
878 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
879 {
880 uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
881 uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
882
883 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
884 {
885 uint32_t numEmittedVerts = pVertexCount[inputPrim];
886 if (numEmittedVerts == 0)
887 {
888 continue;
889 }
890
891 uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
892 uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
893
894 uint32_t numAttribs = state.feNumAttributes;
895
896 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
897 {
898 bool processCutVerts = false;
899
900 uint8_t* pCutBuffer = pCutBase;
901
902 // assign default stream ID, only relevant when GS is outputting a single stream
903 uint32_t streamID = 0;
904 if (pState->isSingleStream)
905 {
906 processCutVerts = true;
907 streamID = pState->singleStreamID;
908 if (streamID != stream) continue;
909 }
910 else
911 {
912 // early exit if this stream is not enabled for streamout
913 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
914 {
915 continue;
916 }
917
918 // multi-stream output, need to translate StreamID buffer to a cut buffer
919 ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
920 pCutBuffer = (uint8_t*)pStreamCutBuffer;
921 processCutVerts = false;
922 }
923
924 #if USE_SIMD16_FRONTEND
925 // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex
926
927 SWR_ASSERT(numEmittedVerts <= 256);
928
929 PackPairsOfSimdVertexIntoSimd16Vertex(
930 tempVertex_simd16,
931 reinterpret_cast<const simdvertex *>(pBase),
932 numEmittedVerts,
933 KNOB_NUM_ATTRIBUTES);
934
935 #endif
936 #if USE_SIMD16_FRONTEND
937 PA_STATE_CUT gsPa(pDC, reinterpret_cast<uint8_t *>(tempVertex_simd16), numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
938
939 #else
940 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
941
942 #endif
943 while (gsPa.GetNextStreamOutput())
944 {
945 do
946 {
947 #if USE_SIMD16_FRONTEND
948 simd16vector attrib_simd16[3];
949
950 bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
951
952 #else
953 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
954
955 #endif
956 if (assemble)
957 {
958 totalPrimsGenerated += gsPa.NumPrims();
959
960 if (HasStreamOutT::value)
961 {
962 #if USE_SIMD16_FRONTEND
963 const uint32_t numPrims = gsPa.NumPrims();
964 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
965 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
966
967 gsPa.useAlternateOffset = false;
968 StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
969
970 if (numPrims_hi)
971 {
972 gsPa.useAlternateOffset = true;
973 StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
974 }
975 #else
976 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
977 #endif
978 }
979
980 if (HasRastT::value && state.soState.streamToRasterizer == stream)
981 {
982 #if USE_SIMD16_FRONTEND
983 simd16scalari vPrimId;
984 // pull primitiveID from the GS output if available
985 if (state.gsState.emitsPrimitiveID)
986 {
987 simd16vector primIdAttrib[3];
988 gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
989 vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
990 }
991 else
992 {
993 vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
994 }
995
996 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
997 simd16scalari vViewPortIdx;
998 if (state.gsState.emitsViewportArrayIndex)
999 {
1000 simd16vector vpiAttrib[3];
1001 gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
1002
1003 // OOB indices => forced to zero.
1004 simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1005 simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
1006 vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
1007
1008 vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
1009 }
1010 else
1011 {
1012 vViewPortIdx = _simd16_set1_epi32(0);
1013 }
1014
1015 gsPa.useAlternateOffset = false;
1016 pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1017 #else
1018 simdscalari vPrimId;
1019 // pull primitiveID from the GS output if available
1020 if (state.gsState.emitsPrimitiveID)
1021 {
1022 simdvector primIdAttrib[3];
1023 gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
1024 vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
1025 }
1026 else
1027 {
1028 vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1029 }
1030
1031 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
1032 simdscalari vViewPortIdx;
1033 if (state.gsState.emitsViewportArrayIndex)
1034 {
1035 simdvector vpiAttrib[3];
1036 gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
1037
1038 // OOB indices => forced to zero.
1039 simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1040 simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
1041 vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
1042
1043 vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
1044 }
1045 else
1046 {
1047 vViewPortIdx = _simd_set1_epi32(0);
1048 }
1049
1050 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1051 #endif
1052 }
1053 }
1054 } while (gsPa.NextPrim());
1055 }
1056 }
1057 }
1058 }
1059
1060 // update GS pipeline stats
1061 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1062 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1063 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
1064 AR_END(FEGeometryShader, 1);
1065 }
1066
1067 //////////////////////////////////////////////////////////////////////////
1068 /// @brief Allocate GS buffers
1069 /// @param pDC - pointer to draw context.
1070 /// @param state - API state
1071 /// @param ppGsOut - pointer to GS output buffer allocation
1072 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1073 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
1074 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
1075 void **ppStreamCutBuffer)
1076 {
1077 auto pArena = pDC->pArena;
1078 SWR_ASSERT(pArena != nullptr);
1079 SWR_ASSERT(state.gsState.gsEnable);
1080
1081 // allocate arena space to hold GS output verts
1082 // @todo pack attribs
1083 // @todo support multiple streams
1084
1085 const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
1086
1087 const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
1088
1089 *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
1090
1091 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
1092 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
1093
1094 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
1095 if (state.gsState.isSingleStream)
1096 {
1097 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1098
1099 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1100 *ppStreamCutBuffer = nullptr;
1101 }
1102 else
1103 {
1104 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1105 const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
1106
1107 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1108 *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
1109 }
1110 }
1111
1112 //////////////////////////////////////////////////////////////////////////
1113 /// @brief Contains all data generated by the HS and passed to the
1114 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // hull-shader context; pCPout points into patchData below
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS output patches, one per SIMD lane
    void* pTxCtx;                           // tessellator context storage, lazily allocated on first TSInitCtx failure
    size_t tsCtxSize;                       // size in bytes used when (re)allocating pTxCtx

    simdscalar* pDSOutput;                  // domain-shader output buffer, grown on demand per tessellated patch
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in simd vectors
};
1125
// Per-thread tessellation scratch data; allocated on first use by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1127
1128 //////////////////////////////////////////////////////////////////////////
1129 /// @brief Allocate tessellation data for this worker thread.
1130 INLINE
1131 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1132 {
1133 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1134 if (gt_pTessellationThreadData == nullptr)
1135 {
1136 gt_pTessellationThreadData = (TessellationThreadLocalData*)
1137 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1138 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1139 }
1140 }
1141
1142 //////////////////////////////////////////////////////////////////////////
1143 /// @brief Implements Tessellation Stages.
1144 /// @param pDC - pointer to draw context.
1145 /// @param workerId - thread's worker id. Even thread has a unique id.
1146 /// @param pa - The primitive assembly object.
1147 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // initialize the tessellator against the thread-local context storage;
    // on failure, (re)allocate the backing storage at tsCtxSize and retry once
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // select the binner/clipper entry point for the post-DS topology
    // (only needed when rasterization is enabled)
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives into the HS input verts
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison HS output so stale reads are visible in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    // caller passes the active prim count for this simd8 half of the simd16 PA
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    // per-lane primitive IDs, read scalar-wise below
    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        // patch fully culled by the tessellator
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory (grow-only, cached in thread-local data)
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;      // simd8 -> simd16, padding
#else
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
#if USE_SIMD16_FRONTEND
            gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
#endif
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // poison DS output so stale reads are visible in debug builds
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all domain points, one SIMD at a time
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes past the last domain point in the final iteration
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride));  // simd8 -> simd16

#endif
        // assemble DS output into post-tessellation primitives
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),  // simd8 -> simd16
            dsContext.vectorStride / 2,                                     // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            // split the simd16 prim batch into two simd8 halves for the downstream simd8 stages
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            // NOTE(review): primMask_lo/primMask_hi are currently unused (only primMask is consumed below)
            const uint32_t primMask = GenMask(numPrims);
            const uint32_t primMask_lo = primMask & 255;
            const uint32_t primMask_hi = (primMask >> 8) & 255;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                // GS consumes simd8; run it once per simd8 half, selecting the half via useAlternateOffset
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if USE_SIMD16_FRONTEND
                    // StreamOut consumes simd8; run once per simd8 half
                    tessPa.useAlternateOffset = false;
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);

                    if (numPrims_hi)
                    {
                        tessPa.useAlternateOffset = true;
                        StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
                    }
#else
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
#endif
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3];
#endif
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    tessPa.useAlternateOffset = false;
                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0));
#else
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1426
1427 //////////////////////////////////////////////////////////////////////////
1428 /// @brief FE handler for SwrDraw.
1429 /// @tparam IsIndexedT - Is indexed drawing enabled
1430 /// @tparam HasTessellationT - Is tessellation enabled
1431 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1432 /// @tparam HasStreamOutT - Is stream-out enabled
1433 /// @tparam HasRastT - Is rasterization enabled
1434 /// @param pContext - pointer to SWR context.
1435 /// @param pDC - pointer to draw context.
1436 /// @param workerId - thread's worker id.
1437 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last requested index
    // so partial-SIMD fetches can be masked off against it.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    // only needed for the AR_END stat at the bottom of this function
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // per-draw GS scratch buffers, allocated out of the draw arena
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
#else
        AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
    // simd16 frontend: fetch/VS still run at simd8, so everything is staged as
    // a lo/hi pair of simd8 halves that together fill one simd16 PA input.
    simdvertex vin_lo;
    simdvertex vin_hi;
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;
    vsContext_lo.AlternateOffset = 0;
    vsContext_hi.AlternateOffset = 1;

    SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
        {
            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    // hi half starts as a copy of lo; only its index pointer differs below
    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.pIndices = work.pIB;
            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            // auto-generated sequential indices for non-indexed draws
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
            fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH;             // 1/2 of KNOB_SIMD16_WIDTH
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.

            simdmask *pvCutIndices_lo = nullptr;
            simdmask *pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
            }

            simd16vertex &vout = pa.GetNextVsOutput();

            // both halves write into the same simd16 vertex; the hi half lands in the
            // alternate-offset lanes (vsContext_hi.AlternateOffset == 1)
            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo_lo, vin_lo);

                if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                {
                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
                }
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;

                // Setup active mask for vertex shader.
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                    if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
                    }
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // split the simd16 batch into two simd8 halves for the simd8 downstream stages
                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            // NOTE(review): primMask_lo/primMask_hi are currently unused (only primMask is consumed below)
                            const uint32_t primMask = GenMask(numPrims);
                            const uint32_t primMask_lo = primMask & 255;
                            const uint32_t primMask_hi = (primMask >> 8) & 255;

                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
#if 1
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);

                                    if (numPrims_hi)
                                    {
                                        pa.useAlternateOffset = true;
                                        StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
                                    }
#else
                                    pa.useAlternateOffset = false;  // StreamOut() is SIMD16-compatible..
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
#endif
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);

                                    pa.useAlternateOffset = false;
                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si());
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance fetch source by one full simd16's worth of vertices
            if (IsIndexedT::value)
            {
                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    // simd8 frontend path
    simdvertex vin;
    SWR_VS_CONTEXT vsContext;

    vsContext.pVin = &vin;

    SWR_FETCH_CONTEXT fetchInfo = { 0 };

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // auto-generated sequential indices for non-indexed draws
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance fetch source by one simd8's worth of vertices
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1939
// Chooser functor consumed by TemplateArgUnroller to map a set of boolean
// template arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    // returns the ProcessDraw instantiation for the given argument pack
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1950
1951
1952 // Selector for correct templated Draw front-end function
// Selects the templated ProcessDraw front-end function that matches the
// given runtime pipeline configuration flags.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}