swr/rast: fix USE_SIMD16_FRONTEND issues
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper function to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return (numBits == 32) ? 0xFFFFFFFFU : ((1U << numBits) - 1); // guard numBits == 32; 1U << 32 is undefined behavior
49 }
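//////////////////////////////////////////////////////////////////////////
/// Illustrative usage (examples added for clarity, not in the original source):
///
///     GenMask(3)  == 0x7      // lanes 0..2 enabled
///     GenMask(8)  == 0xFF     // full simd8 mask
///     GenMask(16) == 0xFFFF   // full simd16 mask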
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
56 /// @param workerId - thread's worker id. Each thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
58 void ProcessSync(
59 SWR_CONTEXT *pContext,
60 DRAW_CONTEXT *pDC,
61 uint32_t workerId,
62 void *pUserData)
63 {
64 BE_WORK work;
65 work.type = SYNC;
66 work.pfnWork = ProcessSyncBE;
67
68 MacroTileMgr *pTileMgr = pDC->pTileMgr;
69 pTileMgr->enqueue(0, 0, &work);
70 }
71
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Each thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to the shutdown callback.
78 void ProcessShutdown(
79 SWR_CONTEXT *pContext,
80 DRAW_CONTEXT *pDC,
81 uint32_t workerId,
82 void *pUserData)
83 {
84 BE_WORK work;
85 work.type = SHUTDOWN;
86 work.pfnWork = ProcessShutdownBE;
87
88 MacroTileMgr *pTileMgr = pDC->pTileMgr;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94 {
95 for (uint32_t n = 0; n < numNumaNodes; ++n)
96 {
97 pTileMgr->enqueue(i, n, &work);
98 }
99 }
100 }
101
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Each thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
109 void ProcessClear(
110 SWR_CONTEXT *pContext,
111 DRAW_CONTEXT *pDC,
112 uint32_t workerId,
113 void *pUserData)
114 {
115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116 MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124
125 BE_WORK work;
126 work.type = CLEAR;
127 work.pfnWork = ProcessClearBE;
128 work.desc.clear = *pDesc;
129
130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131 {
132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133 {
134 pTileMgr->enqueue(x, y, &work);
135 }
136 }
137 }
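// Illustrative example, assuming 32x32 macro tiles (the actual dimensions come
// from KNOB_MACROTILE_X_DIM / KNOB_MACROTILE_Y_DIM): a clear rect of
// (0,0)-(100,80) maps to macroTileXMin..XMax == 0..3 ((100 - 1) / 32 == 3) and
// macroTileYMin..YMax == 0..2 ((80 - 1) / 32 == 2), so 4 x 3 == 12 macro tiles
// each receive a copy of the CLEAR work item.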
138
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Each thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT *pContext,
148 DRAW_CONTEXT *pDC,
149 uint32_t workerId,
150 void *pUserData)
151 {
152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153 MacroTileMgr *pTileMgr = pDC->pTileMgr;
154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163 // store tiles
164 BE_WORK work;
165 work.type = STORETILES;
166 work.pfnWork = ProcessStoreTilesBE;
167 work.desc.storeTiles = *pDesc;
168
169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170 {
171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172 {
173 pTileMgr->enqueue(x, y, &work);
174 }
175 }
176
177 AR_END(FEProcessStoreTiles, 0);
178 }
179
180 //////////////////////////////////////////////////////////////////////////
181 /// @brief FE handler for SwrInvalidateTiles.
182 /// @param pContext - pointer to SWR context.
183 /// @param pDC - pointer to draw context.
184 /// @param workerId - thread's worker id. Each thread has a unique id.
185 /// @param pUserData - Pointer to user data passed back to callback.
186 /// @todo This should go away when we switch this to use compute threading.
187 void ProcessDiscardInvalidateTiles(
188 SWR_CONTEXT *pContext,
189 DRAW_CONTEXT *pDC,
190 uint32_t workerId,
191 void *pUserData)
192 {
193 AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
194 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195 MacroTileMgr *pTileMgr = pDC->pTileMgr;
196
197 // compute macro tile bounds for the specified rect
198 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
199 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
200 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
201 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202
203 if (pDesc->fullTilesOnly == false)
204 {
205 // include partial tiles
206 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
207 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
208 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
209 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210 }
211
212 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
213 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214
215 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
216 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217
218 // discard/invalidate tiles
219 BE_WORK work;
220 work.type = DISCARDINVALIDATETILES;
221 work.pfnWork = ProcessDiscardInvalidateTilesBE;
222 work.desc.discardInvalidateTiles = *pDesc;
223
224 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225 {
226 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227 {
228 pTileMgr->enqueue(x, y, &work);
229 }
230 }
231
232 AR_END(FEProcessInvalidateTiles, 0);
233 }
234
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Computes the number of primitives given the number of verts.
237 /// @param mode - primitive topology for draw operation.
238 /// @param numPrims - number of vertices or indices for draw.
239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
240 uint32_t GetNumPrims(
241 PRIMITIVE_TOPOLOGY mode,
242 uint32_t numPrims)
243 {
244 switch (mode)
245 {
246 case TOP_POINT_LIST: return numPrims;
247 case TOP_TRIANGLE_LIST: return numPrims / 3;
248 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251 case TOP_QUAD_LIST: return numPrims / 4;
252 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254 case TOP_LINE_LIST: return numPrims / 2;
255 case TOP_LINE_LOOP: return numPrims;
256 case TOP_RECT_LIST: return numPrims / 3;
257 case TOP_LINE_LIST_ADJ: return numPrims / 4;
258 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259 case TOP_TRI_LIST_ADJ: return numPrims / 6;
260 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261
262 case TOP_PATCHLIST_1:
263 case TOP_PATCHLIST_2:
264 case TOP_PATCHLIST_3:
265 case TOP_PATCHLIST_4:
266 case TOP_PATCHLIST_5:
267 case TOP_PATCHLIST_6:
268 case TOP_PATCHLIST_7:
269 case TOP_PATCHLIST_8:
270 case TOP_PATCHLIST_9:
271 case TOP_PATCHLIST_10:
272 case TOP_PATCHLIST_11:
273 case TOP_PATCHLIST_12:
274 case TOP_PATCHLIST_13:
275 case TOP_PATCHLIST_14:
276 case TOP_PATCHLIST_15:
277 case TOP_PATCHLIST_16:
278 case TOP_PATCHLIST_17:
279 case TOP_PATCHLIST_18:
280 case TOP_PATCHLIST_19:
281 case TOP_PATCHLIST_20:
282 case TOP_PATCHLIST_21:
283 case TOP_PATCHLIST_22:
284 case TOP_PATCHLIST_23:
285 case TOP_PATCHLIST_24:
286 case TOP_PATCHLIST_25:
287 case TOP_PATCHLIST_26:
288 case TOP_PATCHLIST_27:
289 case TOP_PATCHLIST_28:
290 case TOP_PATCHLIST_29:
291 case TOP_PATCHLIST_30:
292 case TOP_PATCHLIST_31:
293 case TOP_PATCHLIST_32:
294 return numPrims / (mode - TOP_PATCHLIST_BASE);
295
296 case TOP_POLYGON:
297 case TOP_POINT_LIST_BF:
298 case TOP_LINE_STRIP_CONT:
299 case TOP_LINE_STRIP_BF:
300 case TOP_LINE_STRIP_CONT_BF:
301 case TOP_TRIANGLE_FAN_NOSTIPPLE:
302 case TOP_TRI_STRIP_REVERSE:
303 case TOP_PATCHLIST_BASE:
304 case TOP_UNKNOWN:
305 SWR_INVALID("Unsupported topology: %d", mode);
306 return 0;
307 }
308
309 return 0;
310 }
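//////////////////////////////////////////////////////////////////////////
/// Illustrative examples (added for clarity, not in the original source):
///
///     GetNumPrims(TOP_TRIANGLE_LIST,  10) == 3   // 10 / 3, trailing vert dropped
///     GetNumPrims(TOP_TRIANGLE_STRIP, 10) == 8   // 10 - 2
///     GetNumPrims(TOP_PATCHLIST_4,    20) == 5   // 20 verts / 4 control points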
311
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Computes the number of verts given the number of primitives.
314 /// @param mode - primitive topology for draw operation.
315 /// @param numPrims - number of primitives for draw.
316 uint32_t GetNumVerts(
317 PRIMITIVE_TOPOLOGY mode,
318 uint32_t numPrims)
319 {
320 switch (mode)
321 {
322 case TOP_POINT_LIST: return numPrims;
323 case TOP_TRIANGLE_LIST: return numPrims * 3;
324 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327 case TOP_QUAD_LIST: return numPrims * 4;
328 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330 case TOP_LINE_LIST: return numPrims * 2;
331 case TOP_LINE_LOOP: return numPrims;
332 case TOP_RECT_LIST: return numPrims * 3;
333 case TOP_LINE_LIST_ADJ: return numPrims * 4;
334 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335 case TOP_TRI_LIST_ADJ: return numPrims * 6;
336 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337
338 case TOP_PATCHLIST_1:
339 case TOP_PATCHLIST_2:
340 case TOP_PATCHLIST_3:
341 case TOP_PATCHLIST_4:
342 case TOP_PATCHLIST_5:
343 case TOP_PATCHLIST_6:
344 case TOP_PATCHLIST_7:
345 case TOP_PATCHLIST_8:
346 case TOP_PATCHLIST_9:
347 case TOP_PATCHLIST_10:
348 case TOP_PATCHLIST_11:
349 case TOP_PATCHLIST_12:
350 case TOP_PATCHLIST_13:
351 case TOP_PATCHLIST_14:
352 case TOP_PATCHLIST_15:
353 case TOP_PATCHLIST_16:
354 case TOP_PATCHLIST_17:
355 case TOP_PATCHLIST_18:
356 case TOP_PATCHLIST_19:
357 case TOP_PATCHLIST_20:
358 case TOP_PATCHLIST_21:
359 case TOP_PATCHLIST_22:
360 case TOP_PATCHLIST_23:
361 case TOP_PATCHLIST_24:
362 case TOP_PATCHLIST_25:
363 case TOP_PATCHLIST_26:
364 case TOP_PATCHLIST_27:
365 case TOP_PATCHLIST_28:
366 case TOP_PATCHLIST_29:
367 case TOP_PATCHLIST_30:
368 case TOP_PATCHLIST_31:
369 case TOP_PATCHLIST_32:
370 return numPrims * (mode - TOP_PATCHLIST_BASE);
371
372 case TOP_POLYGON:
373 case TOP_POINT_LIST_BF:
374 case TOP_LINE_STRIP_CONT:
375 case TOP_LINE_STRIP_BF:
376 case TOP_LINE_STRIP_CONT_BF:
377 case TOP_TRIANGLE_FAN_NOSTIPPLE:
378 case TOP_TRI_STRIP_REVERSE:
379 case TOP_PATCHLIST_BASE:
380 case TOP_UNKNOWN:
381 SWR_INVALID("Unsupported topology: %d", mode);
382 return 0;
383 }
384
385 return 0;
386 }
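//////////////////////////////////////////////////////////////////////////
/// Note: GetNumVerts is the inverse of GetNumPrims, and composing the two
/// prunes a partial trailing primitive, exactly as ProcessDraw does below
/// for non-indexed draws. Illustrative example:
///
///     GetNumVerts(TOP_TRIANGLE_LIST, GetNumPrims(TOP_TRIANGLE_LIST, 10)) == 9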
387
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Return number of verts per primitive.
390 /// @param topology - topology
391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393 {
394 uint32_t numVerts = 0;
395 switch (topology)
396 {
397 case TOP_POINT_LIST:
398 case TOP_POINT_LIST_BF:
399 numVerts = 1;
400 break;
401 case TOP_LINE_LIST:
402 case TOP_LINE_STRIP:
403 case TOP_LINE_LIST_ADJ:
404 case TOP_LINE_LOOP:
405 case TOP_LINE_STRIP_CONT:
406 case TOP_LINE_STRIP_BF:
407 case TOP_LISTSTRIP_ADJ:
408 numVerts = 2;
409 break;
410 case TOP_TRIANGLE_LIST:
411 case TOP_TRIANGLE_STRIP:
412 case TOP_TRIANGLE_FAN:
413 case TOP_TRI_LIST_ADJ:
414 case TOP_TRI_STRIP_ADJ:
415 case TOP_TRI_STRIP_REVERSE:
416 case TOP_RECT_LIST:
417 numVerts = 3;
418 break;
419 case TOP_QUAD_LIST:
420 case TOP_QUAD_STRIP:
421 numVerts = 4;
422 break;
423 case TOP_PATCHLIST_1:
424 case TOP_PATCHLIST_2:
425 case TOP_PATCHLIST_3:
426 case TOP_PATCHLIST_4:
427 case TOP_PATCHLIST_5:
428 case TOP_PATCHLIST_6:
429 case TOP_PATCHLIST_7:
430 case TOP_PATCHLIST_8:
431 case TOP_PATCHLIST_9:
432 case TOP_PATCHLIST_10:
433 case TOP_PATCHLIST_11:
434 case TOP_PATCHLIST_12:
435 case TOP_PATCHLIST_13:
436 case TOP_PATCHLIST_14:
437 case TOP_PATCHLIST_15:
438 case TOP_PATCHLIST_16:
439 case TOP_PATCHLIST_17:
440 case TOP_PATCHLIST_18:
441 case TOP_PATCHLIST_19:
442 case TOP_PATCHLIST_20:
443 case TOP_PATCHLIST_21:
444 case TOP_PATCHLIST_22:
445 case TOP_PATCHLIST_23:
446 case TOP_PATCHLIST_24:
447 case TOP_PATCHLIST_25:
448 case TOP_PATCHLIST_26:
449 case TOP_PATCHLIST_27:
450 case TOP_PATCHLIST_28:
451 case TOP_PATCHLIST_29:
452 case TOP_PATCHLIST_30:
453 case TOP_PATCHLIST_31:
454 case TOP_PATCHLIST_32:
455 numVerts = topology - TOP_PATCHLIST_BASE;
456 break;
457 default:
458 SWR_INVALID("Unsupported topology: %d", topology);
459 break;
460 }
461
462 if (includeAdjVerts)
463 {
464 switch (topology)
465 {
466 case TOP_LISTSTRIP_ADJ:
467 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468 case TOP_TRI_STRIP_ADJ:
469 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470 default: break;
471 }
472 }
473
474 return numVerts;
475 }
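//////////////////////////////////////////////////////////////////////////
/// Illustrative examples: adjacency vertices are counted only when the
/// caller asks for them, e.g. the GS consumes them but the binner does not:
///
///     NumVertsPerPrim(TOP_TRI_LIST_ADJ, false) == 3   // verts of the base tri
///     NumVertsPerPrim(TOP_TRI_LIST_ADJ, true)  == 6   // including adjacent verts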
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numItemsRemaining - Number of items remaining to be worked on by a SIMD.
480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484 return _simd_castps_si(_simd_vmask_ps(mask));
485 }
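// Note: GenerateMask is the vector counterpart of GenMask above. numItemsRemaining
// is clamped to KNOB_SIMD_WIDTH first, so e.g. GenerateMask(3) yields a register
// whose lanes 0..2 are all-ones and whose remaining lanes are zero, while any
// value >= the SIMD width yields a full mask.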
486
487 //////////////////////////////////////////////////////////////////////////
488 /// @brief StreamOut - Streams vertex data out to SO buffers.
489 ///        Generally, we are only streaming out a SIMD's worth of primitives.
490 /// @param pDC - pointer to draw context.
491 /// @param workerId - thread's worker id. Each thread has a unique id.
492 /// @param pPrimData - Scratch buffer used to stage vertex attribute data for the streamout shader.
493 static void StreamOut(
494 DRAW_CONTEXT* pDC,
495 PA_STATE& pa,
496 uint32_t workerId,
497 uint32_t* pPrimData,
498 uint32_t streamIndex)
499 {
500 SWR_CONTEXT *pContext = pDC->pContext;
501
502 AR_BEGIN(FEStreamout, pDC->drawId);
503
504 const API_STATE& state = GetApiState(pDC);
505 const SWR_STREAMOUT_STATE &soState = state.soState;
506
507 uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
508
509 // The pPrimData buffer is sparse in that we allocate memory for all SWR_VTX_NUM_SLOTS attribute slots for each vertex.
510 uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
511
512 SWR_STREAMOUT_CONTEXT soContext = { 0 };
513
514 // Setup buffer state pointers.
515 for (uint32_t i = 0; i < 4; ++i)
516 {
517 soContext.pBuffer[i] = &state.soBuffer[i];
518 }
519
520 uint32_t numPrims = pa.NumPrims();
521
522 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
523 {
524 DWORD slot = 0;
525 uint32_t soMask = soState.streamMasks[streamIndex];
526
527 // Write all entries into primitive data buffer for SOS.
528 while (_BitScanForward(&slot, soMask))
529 {
530 simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
531 uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
532 pa.AssembleSingle(paSlot, primIndex, attrib);
533
534 // Attribute offset is relative offset from start of vertex.
535 // Note: in a DX-style layout, attributes start at slot 1 in the PA buffer and
536 // would be written to prim data at (slot - 1). GL lays attributes out starting
537 // at slot 0, so the prim data offset below uses slot directly.
538 uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
539
540 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
541 for (uint32_t v = 0; v < soVertsPerPrim; ++v)
542 {
543 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
544
545 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
546 }
547
548 soMask &= ~(1 << slot);
549 }
550
551 // Update pPrimData pointer
552 soContext.pPrimData = pPrimData;
553
554 // Call SOS
555 SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
556 state.pfnSoFunc[streamIndex](soContext);
557 }
558
559 // Update SO write offset. The driver provides memory for the update.
560 for (uint32_t i = 0; i < 4; ++i)
561 {
562 if (state.soBuffer[i].pWriteOffset)
563 {
564 *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
565 }
566
567 if (state.soBuffer[i].soWriteEnable)
568 {
569 pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
570 pDC->dynState.SoWriteOffsetDirty[i] = true;
571 }
572 }
573
574 UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
575 UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
576
577 AR_END(FEStreamout, 1);
578 }
579
580 #if USE_SIMD16_FRONTEND
581 //////////////////////////////////////////////////////////////////////////
582 /// Is value an even number (a multiple of two)
583 ///
584 template <typename T>
585 INLINE static bool IsEven(T value)
586 {
587 return (value & 1) == 0;
588 }
589
590 //////////////////////////////////////////////////////////////////////////
591 /// Round up value to an even number (a multiple of two)
592 ///
593 template <typename T>
594 INLINE static T RoundUpEven(T value)
595 {
596 return (value + 1) & ~1;
597 }
598
599 //////////////////////////////////////////////////////////////////////////
600 /// Round down value to an even number (a multiple of two)
601 ///
602 template <typename T>
603 INLINE static T RoundDownEven(T value)
604 {
605 return value & ~1;
606 }
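// Illustrative examples:
//
//     IsEven(6)        == true    IsEven(7)        == false
//     RoundUpEven(7)   == 8       RoundUpEven(8)   == 8
//     RoundDownEven(7) == 6       RoundDownEven(6) == 6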
607
608 //////////////////////////////////////////////////////////////////////////
609 /// Pack pairs of simdvertexes into simd16vertexes; assumes non-overlapping source and destination
610 ///
611 /// vertexCount is in terms of the source simdvertexes and must be even
612 ///
613 /// attribCount will limit the vector copies to those attribs specified
614 ///
615 /// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
616 ///
617 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
618 {
619 SWR_ASSERT(vertex);
620 SWR_ASSERT(vertex_simd16);
621 SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
622
623 simd16vertex temp;
624
625 for (uint32_t i = 0; i < vertexCount; i += 2)
626 {
627 for (uint32_t j = 0; j < attribCount; j += 1)
628 {
629 for (uint32_t k = 0; k < 4; k += 1)
630 {
631 temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
632
633 if ((i + 1) < vertexCount)
634 {
635 temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
636 }
637 }
638 }
639
640 for (uint32_t j = 0; j < attribCount; j += 1)
641 {
642 vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
643 }
644 }
645 }
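// Illustrative sketch of the resulting lane layout: each pair of simd8 source
// vertices fills one simd16 destination vertex.
//
//     vertex[0] -> vertex_simd16[0], lanes 0..7
//     vertex[1] -> vertex_simd16[0], lanes 8..15
//     vertex[2] -> vertex_simd16[1], lanes 0..7    ... and so on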
646
647 #endif
648 //////////////////////////////////////////////////////////////////////////
649 /// @brief Computes number of invocations. The current index represents
650 ///        the start of the SIMD. The max index represents how many work
651 ///        items are remaining. If less than a SIMD's worth of work remains,
652 ///        then return the remaining amount of work.
653 /// @param curIndex - The start index for the SIMD.
654 /// @param maxIndex - The last index for all work items.
655 static INLINE uint32_t GetNumInvocations(
656 uint32_t curIndex,
657 uint32_t maxIndex)
658 {
659 uint32_t remainder = (maxIndex - curIndex);
660 #if USE_SIMD16_FRONTEND
661 return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
662 #else
663 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
664 #endif
665 }
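// Illustrative examples with USE_SIMD16_FRONTEND enabled (KNOB_SIMD16_WIDTH == 16):
//
//     GetNumInvocations(0,  100) == 16   // at least a full SIMD of work remains
//     GetNumInvocations(96, 100) == 4    // partial tail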
666
667 //////////////////////////////////////////////////////////////////////////
668 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
669 /// The geometry shader will loop over each active streamout buffer, assembling
670 /// primitives for the downstream stages. When multistream output is enabled,
671 /// the generated stream ID buffer from the GS needs to be converted to a cut
672 /// buffer for the primitive assembler.
673 /// @param stream - stream id to generate the cut buffer for
674 /// @param pStreamIdBase - pointer to the stream ID buffer
675 /// @param numEmittedVerts - Number of total verts emitted by the GS
676 /// @param pCutBuffer - output buffer to write cuts to
677 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
678 {
679 SWR_ASSERT(stream < MAX_SO_STREAMS);
680
681 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
682 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
683
684 for (uint32_t b = 0; b < numOutputBytes; ++b)
685 {
686 uint8_t curInputByte = pStreamIdBase[2*b];
687 uint8_t outByte = 0;
688 for (uint32_t i = 0; i < 4; ++i)
689 {
690 if ((curInputByte & 0x3) != stream)
691 {
692 outByte |= (1 << i);
693 }
694 curInputByte >>= 2;
695 }
696
697 curInputByte = pStreamIdBase[2 * b + 1];
698 for (uint32_t i = 0; i < 4; ++i)
699 {
700 if ((curInputByte & 0x3) != stream)
701 {
702 outByte |= (1 << (i + 4));
703 }
704 curInputByte >>= 2;
705 }
706
707 *pCutBuffer++ = outByte;
708 }
709 }
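// Illustrative worked example: each input byte holds four 2-bit stream IDs,
// LSB first, and each output bit marks a cut, i.e. a vertex NOT belonging to
// the requested stream. For stream == 1 and an input byte of 0x44 (IDs
// 0,1,0,1 for verts 0..3), verts 0 and 2 mismatch, so the low nibble of the
// output byte is 0b0101.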
710
711 THREAD SWR_GS_CONTEXT tlsGsContext;
712
713 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
714 struct GsBufferInfo
715 {
716 GsBufferInfo(const SWR_GS_STATE &gsState)
717 {
718 const uint32_t vertexCount = gsState.maxNumVerts;
719 const uint32_t vertexStride = sizeof(SIMDVERTEX);
720 const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
721
722 vertexPrimitiveStride = vertexStride * numSimdBatches;
723 vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
724
725 if (gsState.isSingleStream)
726 {
727 cutPrimitiveStride = (vertexCount + 7) / 8;
728 cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
729
730 streamCutPrimitiveStride = 0;
731 streamCutInstanceStride = 0;
732 }
733 else
734 {
735 cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
736 cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
737
738 streamCutPrimitiveStride = (vertexCount + 7) / 8;
739 streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
740 }
741 }
742
743 uint32_t vertexPrimitiveStride;
744 uint32_t vertexInstanceStride;
745
746 uint32_t cutPrimitiveStride;
747 uint32_t cutInstanceStride;
748
749 uint32_t streamCutPrimitiveStride;
750 uint32_t streamCutInstanceStride;
751 };
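// Illustrative example (simd16 path): for a GS with maxNumVerts == 6,
// numSimdBatches == ceil(6 / 16) == 1, so vertexPrimitiveStride ==
// sizeof(simd16vertex), and vertexInstanceStride reserves that run of
// storage for each of the 16 input primitives a single GS instance shades.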
752
753 //////////////////////////////////////////////////////////////////////////
754 /// @brief Implements GS stage.
755 /// @param pDC - pointer to draw context.
756 /// @param workerId - thread's worker id. Each thread has a unique id.
757 /// @param pa - The primitive assembly object.
758 /// @param pGsOut - output stream for GS
759 template <
760 typename HasStreamOutT,
761 typename HasRastT>
762 static void GeometryShaderStage(
763 DRAW_CONTEXT *pDC,
764 uint32_t workerId,
765 PA_STATE& pa,
766 void* pGsOut,
767 void* pCutBuffer,
768 void* pStreamCutBuffer,
769 uint32_t* pSoPrimData,
770 #if USE_SIMD16_FRONTEND
771 uint32_t numPrims_simd8,
772 #endif
773 simdscalari primID)
774 {
775 SWR_CONTEXT *pContext = pDC->pContext;
776
777 AR_BEGIN(FEGeometryShader, pDC->drawId);
778
779 const API_STATE& state = GetApiState(pDC);
780 const SWR_GS_STATE* pState = &state.gsState;
781
782 SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
783 SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
784
785 tlsGsContext.pStream = (uint8_t*)pGsOut;
786 tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
787 tlsGsContext.PrimitiveID = primID;
788
789 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
790 simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
791
792 // assemble all attributes for the input primitive
793 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
794 {
795 uint32_t attribSlot = pState->vertexAttribOffset + slot;
796 pa.Assemble(attribSlot, attrib);
797
798 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
799 {
800 tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
801 }
802 }
803
804 // assemble position
805 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
806 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
807 {
808 tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
809 }
810
811 #if USE_SIMD16_FRONTEND
812 const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
813 #else
814 const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
815 #endif
816
817 // record valid prims from the frontend to avoid over binning the newly generated
818 // prims from the GS
819 #if USE_SIMD16_FRONTEND
820 uint32_t numInputPrims = numPrims_simd8;
821 #else
822 uint32_t numInputPrims = pa.NumPrims();
823 #endif
824
825 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
826 {
827 tlsGsContext.InstanceID = instance;
828 tlsGsContext.mask = GenerateMask(numInputPrims);
829
830 // execute the geometry shader
831 state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
832
833 tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
834 tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
835 }
836
837 // set up new binner and state for the GS output topology
838 #if USE_SIMD16_FRONTEND
839 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
840 if (HasRastT::value)
841 {
842 switch (pState->outputTopology)
843 {
844 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
845 case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
846 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
847 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
848 }
849 }
850
851 #else
852 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
853 if (HasRastT::value)
854 {
855 switch (pState->outputTopology)
856 {
857 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
858 case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
859 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
860 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
861 }
862 }
863
864 #endif
865 // foreach input prim:
866 // - setup a new PA based on the emitted verts for that prim
867 // - loop over the new verts, calling PA to assemble each prim
868 uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
869 uint32_t* pPrimitiveId = (uint32_t*)&primID;
870
871 uint32_t totalPrimsGenerated = 0;
872 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
873 {
874 uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
875 uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
876
877 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
878 {
879 uint32_t numEmittedVerts = pVertexCount[inputPrim];
880 if (numEmittedVerts == 0)
881 {
882 continue;
883 }
884
885 uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
886 uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
887
888 uint32_t numAttribs = state.feNumAttributes;
889
890 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
891 {
892 bool processCutVerts = false;
893
894 uint8_t* pCutBuffer = pCutBase;
895
896 // assign default stream ID, only relevant when GS is outputting a single stream
897 uint32_t streamID = 0;
898 if (pState->isSingleStream)
899 {
900 processCutVerts = true;
901 streamID = pState->singleStreamID;
902 if (streamID != stream) continue;
903 }
904 else
905 {
906 // early exit if this stream is not enabled for streamout
907 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
908 {
909 continue;
910 }
911
912 // multi-stream output, need to translate StreamID buffer to a cut buffer
913 ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
914 pCutBuffer = (uint8_t*)pStreamCutBuffer;
915 processCutVerts = false;
916 }
917
918 #if USE_SIMD16_FRONTEND
919 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
920
921 #else
922 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
923
924 #endif
925 while (gsPa.GetNextStreamOutput())
926 {
927 do
928 {
929 #if USE_SIMD16_FRONTEND
930 simd16vector attrib_simd16[3];
931
932 bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
933
934 #else
935 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
936
937 #endif
938 if (assemble)
939 {
940 totalPrimsGenerated += gsPa.NumPrims();
941
942 if (HasStreamOutT::value)
943 {
944 #if ENABLE_AVX512_SIMD16
945 gsPa.useAlternateOffset = false;
946 #endif
947 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
948 }
949
950 if (HasRastT::value && state.soState.streamToRasterizer == stream)
951 {
952 #if USE_SIMD16_FRONTEND
953 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
954
955 gsPa.useAlternateOffset = false;
956 pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
957 #else
958 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
959 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
960 #endif
961 }
962 }
963 } while (gsPa.NextPrim());
964 }
965 }
966 }
967 }
968
969 // update GS pipeline stats
970 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
971 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
972 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
973 AR_END(FEGeometryShader, 1);
974 }
975
976 //////////////////////////////////////////////////////////////////////////
977 /// @brief Allocate GS buffers
978 /// @param pDC - pointer to draw context.
979 /// @param state - API state
980 /// @param ppGsOut - pointer to GS output buffer allocation
981 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
982 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
983 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
984 void **ppStreamCutBuffer)
985 {
986 auto pArena = pDC->pArena;
987 SWR_ASSERT(pArena != nullptr);
988 SWR_ASSERT(state.gsState.gsEnable);
989
990 // allocate arena space to hold GS output verts
991 // @todo pack attribs
992 // @todo support multiple streams
993
994 const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
995
996 const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
997
998 *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
999
1000 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
1001 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
1002
1003 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
1004 if (state.gsState.isSingleStream)
1005 {
1006 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1007
1008 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1009 *ppStreamCutBuffer = nullptr;
1010 }
1011 else
1012 {
1013 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1014 const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
1015
1016 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1017 *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
1018 }
1019 }
1020
1021 //////////////////////////////////////////////////////////////////////////
1022 /// @brief Contains all data generated by the HS and passed to the
1023 /// tessellator and DS.
1024 struct TessellationThreadLocalData
1025 {
1026 SWR_HS_CONTEXT hsContext;
1027 ScalarPatch patchData[KNOB_SIMD_WIDTH];
1028 void* pTxCtx;
1029 size_t tsCtxSize;
1030
1031 simdscalar* pDSOutput;
1032 size_t numDSOutputVectors;
1033 };
1034
1035 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1036
1037 //////////////////////////////////////////////////////////////////////////
1038 /// @brief Allocate tessellation data for this worker thread.
1039 INLINE
1040 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1041 {
1042 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1043 if (gt_pTessellationThreadData == nullptr)
1044 {
1045 gt_pTessellationThreadData = (TessellationThreadLocalData*)
1046 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1047 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1048 }
1049 }
1050
1051 //////////////////////////////////////////////////////////////////////////
1052 /// @brief Implements Tessellation Stages.
1053 /// @param pDC - pointer to draw context.
1054 /// @param workerId - thread's worker id. Each thread has a unique id.
1055 /// @param pa - The primitive assembly object.
1056 /// @param pGsOut - output stream for GS
1057 template <
1058 typename HasGeometryShaderT,
1059 typename HasStreamOutT,
1060 typename HasRastT>
1061 static void TessellationStages(
1062 DRAW_CONTEXT *pDC,
1063 uint32_t workerId,
1064 PA_STATE& pa,
1065 void* pGsOut,
1066 void* pCutBuffer,
1067 void* pCutStreamBuffer,
1068 uint32_t* pSoPrimData,
1069 #if USE_SIMD16_FRONTEND
1070 uint32_t numPrims_simd8,
1071 #endif
1072 simdscalari primID)
1073 {
1074 SWR_CONTEXT *pContext = pDC->pContext;
1075 const API_STATE& state = GetApiState(pDC);
1076 const SWR_TS_STATE& tsState = state.tsState;
1077
1078 SWR_ASSERT(gt_pTessellationThreadData);
1079
1080 HANDLE tsCtx = TSInitCtx(
1081 tsState.domain,
1082 tsState.partitioning,
1083 tsState.tsOutputTopology,
1084 gt_pTessellationThreadData->pTxCtx,
1085 gt_pTessellationThreadData->tsCtxSize);
1086 if (tsCtx == nullptr)
1087 {
1088 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1089 tsCtx = TSInitCtx(
1090 tsState.domain,
1091 tsState.partitioning,
1092 tsState.tsOutputTopology,
1093 gt_pTessellationThreadData->pTxCtx,
1094 gt_pTessellationThreadData->tsCtxSize);
1095 }
1096 SWR_ASSERT(tsCtx);
1097
1098 #if USE_SIMD16_FRONTEND
1099 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1100 if (HasRastT::value)
1101 {
1102 switch (tsState.postDSTopology)
1103 {
1104 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
1105 case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
1106 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
1107 default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1108 }
1109 }
1110
1111 #else
1112 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1113 if (HasRastT::value)
1114 {
1115 switch (tsState.postDSTopology)
1116 {
1117 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1118 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
1119 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
1120 default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1121 }
1122 }
1123
1124 #endif
1125 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1126 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1127 hsContext.PrimitiveID = primID;
1128
1129 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1130 // Max storage for one attribute for an entire simdprimitive
1131 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1132
1133 // assemble all attributes for the input primitives
1134 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1135 {
1136 uint32_t attribSlot = tsState.vertexAttribOffset + slot;
1137 pa.Assemble(attribSlot, simdattrib);
1138
1139 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1140 {
1141 hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
1142 }
1143 }
1144
1145 #if defined(_DEBUG)
1146 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1147 #endif
1148
1149 #if USE_SIMD16_FRONTEND
1150 uint32_t numPrims = numPrims_simd8;
1151 #else
1152 uint32_t numPrims = pa.NumPrims();
1153 #endif
1154 hsContext.mask = GenerateMask(numPrims);
1155
1156 // Run the HS
1157 AR_BEGIN(FEHullShader, pDC->drawId);
1158 state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1159 AR_END(FEHullShader, 0);
1160
1161 UPDATE_STAT_FE(HsInvocations, numPrims);
1162
1163 const uint32_t* pPrimId = (const uint32_t*)&primID;
1164
1165 for (uint32_t p = 0; p < numPrims; ++p)
1166 {
1167 // Run Tessellator
1168 SWR_TS_TESSELLATED_DATA tsData = { 0 };
1169 AR_BEGIN(FETessellation, pDC->drawId);
1170 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1171 AR_EVENT(TessPrimCount(1));
1172 AR_END(FETessellation, 0);
1173
1174 if (tsData.NumPrimitives == 0)
1175 {
1176 continue;
1177 }
1178 SWR_ASSERT(tsData.NumDomainPoints);
1179
1180 // Allocate DS Output memory
1181 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1182 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1183 #if USE_SIMD16_FRONTEND
1184 size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
1185 #else
1186 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1187 #endif
1188 if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1189 {
1190 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1191 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1192 #if USE_SIMD16_FRONTEND
1193 gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
1194 #else
1195 gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1196 #endif
1197 }
1198 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1199 SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1200
1201 #if defined(_DEBUG)
1202 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1203 #endif
1204
1205 // Run Domain Shader
1206 SWR_DS_CONTEXT dsContext;
1207 dsContext.PrimitiveID = pPrimId[p];
1208 dsContext.pCpIn = &hsContext.pCPout[p];
1209 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1210 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1211 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1212 #if USE_SIMD16_FRONTEND
1213 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
1214 #else
1215 dsContext.vectorStride = requiredDSVectorInvocations;
1216 #endif
1217
1218 uint32_t dsInvocations = 0;
1219
1220 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1221 {
1222 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1223
1224 AR_BEGIN(FEDomainShader, pDC->drawId);
1225 state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1226 AR_END(FEDomainShader, 0);
1227
1228 dsInvocations += KNOB_SIMD_WIDTH;
1229 }
1230 UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1231
1232 #if USE_SIMD16_FRONTEND
1233 SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
1234
1235 #endif
1236 PA_TESS tessPa(
1237 pDC,
1238 #if USE_SIMD16_FRONTEND
1239 reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
1240 dsContext.vectorStride / 2, // simd8 -> simd16
1241 #else
1242 dsContext.pOutputData,
1243 dsContext.vectorStride,
1244 #endif
1245 SWR_VTX_NUM_SLOTS,
1246 tsState.numDsOutputAttribs,
1247 tsData.ppIndices,
1248 tsData.NumPrimitives,
1249 tsState.postDSTopology);
1250
1251 while (tessPa.HasWork())
1252 {
1253 #if USE_SIMD16_FRONTEND
1254 const uint32_t numPrims = tessPa.NumPrims();
1255 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1256 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
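// note: this min/max pattern splits a simd16 batch into two simd8 halves
// without branching, e.g. (with KNOB_SIMD_WIDTH == 8) numPrims == 11 gives
// numPrims_lo == 8 and numPrims_hi == 3; numPrims == 5 gives 5 and 0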
1257
1258 const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
1259 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1260 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1261
1262 #endif
1263 if (HasGeometryShaderT::value)
1264 {
1265 #if USE_SIMD16_FRONTEND
1266 tessPa.useAlternateOffset = false;
1267 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
1268
1269 if (numPrims_hi)
1270 {
1271 tessPa.useAlternateOffset = true;
1272 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
1273 }
1274 #else
1275 GeometryShaderStage<HasStreamOutT, HasRastT>(
1276 pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1277 _simd_set1_epi32(dsContext.PrimitiveID));
1278 #endif
1279 }
1280 else
1281 {
1282 if (HasStreamOutT::value)
1283 {
1284 #if ENABLE_AVX512_SIMD16
1285 tessPa.useAlternateOffset = false;
1286 #endif
1287 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1288 }
1289
1290 if (HasRastT::value)
1291 {
1292 #if USE_SIMD16_FRONTEND
1293 simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
1294 #else
1295 simdvector prim[3]; // Only deal with triangles, lines, or points
1296 #endif
1297 AR_BEGIN(FEPAAssemble, pDC->drawId);
1298 bool assemble =
1299 #if USE_SIMD16_FRONTEND
1300 tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
1301 #else
1302 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1303 #endif
1304 AR_END(FEPAAssemble, 1);
1305 SWR_ASSERT(assemble);
1306
1307 SWR_ASSERT(pfnClipFunc);
1308 #if USE_SIMD16_FRONTEND
1309 tessPa.useAlternateOffset = false;
1310 pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
1311 #else
1312 pfnClipFunc(pDC, tessPa, workerId, prim,
1313 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
1314 #endif
1315 }
1316 }
1317
1318 tessPa.NextPrim();
1319
1320 } // while (tessPa.HasWork())
1321 } // for (uint32_t p = 0; p < numPrims; ++p)
1322
1323 #if USE_SIMD16_FRONTEND
1324 if (gt_pTessellationThreadData->pDSOutput != nullptr)
1325 {
1326 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1327 gt_pTessellationThreadData->pDSOutput = nullptr;
1328 }
1329 gt_pTessellationThreadData->numDSOutputVectors = 0;
1330
1331 #endif
1332 TSDestroyCtx(tsCtx);
1333 }
1334
1335 THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
1336 THREAD uint32_t gVertexStoreSize = 0;
1337
1338 //////////////////////////////////////////////////////////////////////////
1339 /// @brief FE handler for SwrDraw.
1340 /// @tparam IsIndexedT - Is indexed drawing enabled
1341 /// @tparam HasTessellationT - Is tessellation enabled
1342 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1343 /// @tparam HasStreamOutT - Is stream-out enabled
1344 /// @tparam HasRastT - Is rasterization enabled
1345 /// @param pContext - pointer to SWR context.
1346 /// @param pDC - pointer to draw context.
1347 /// @param workerId - thread's worker id.
1348 /// @param pUserData - Pointer to DRAW_WORK
1349 template <
1350 typename IsIndexedT,
1351 typename IsCutIndexEnabledT,
1352 typename HasTessellationT,
1353 typename HasGeometryShaderT,
1354 typename HasStreamOutT,
1355 typename HasRastT>
1356 void ProcessDraw(
1357 SWR_CONTEXT *pContext,
1358 DRAW_CONTEXT *pDC,
1359 uint32_t workerId,
1360 void *pUserData)
1361 {
1362
1363 #if KNOB_ENABLE_TOSS_POINTS
1364 if (KNOB_TOSS_QUEUE_FE)
1365 {
1366 return;
1367 }
1368 #endif
1369
1370 AR_BEGIN(FEProcessDraw, pDC->drawId);
1371
1372 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1373 const API_STATE& state = GetApiState(pDC);
1374
1375 uint32_t indexSize = 0;
1376 uint32_t endVertex = work.numVerts;
1377
1378 const int32_t* pLastRequestedIndex = nullptr;
1379 if (IsIndexedT::value)
1380 {
1381 switch (work.type)
1382 {
1383 case R32_UINT:
1384 indexSize = sizeof(uint32_t);
1385 pLastRequestedIndex = &(work.pIB[endVertex]);
1386 break;
1387 case R16_UINT:
1388 indexSize = sizeof(uint16_t);
1389 // nasty address offset to last index
1390 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1391 break;
1392 case R8_UINT:
1393 indexSize = sizeof(uint8_t);
1394 // nasty address offset to last index
1395 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1396 break;
1397 default:
1398 SWR_INVALID("Invalid work.type: %d", work.type);
1399 }
1400 }
1401 else
1402 {
1403 // No cuts, prune partial primitives.
1404 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1405 }
1406
1407 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1408 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1409 #endif
1410
1411 void* pGsOut = nullptr;
1412 void* pCutBuffer = nullptr;
1413 void* pStreamCutBuffer = nullptr;
1414 if (HasGeometryShaderT::value)
1415 {
1416 #if USE_SIMD16_FRONTEND
1417 AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1418 #else
1419 AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1420 #endif
1421 }
1422
1423 if (HasTessellationT::value)
1424 {
1425 SWR_ASSERT(state.tsState.tsEnable == true);
1426 SWR_ASSERT(state.pfnHsFunc != nullptr);
1427 SWR_ASSERT(state.pfnDsFunc != nullptr);
1428
1429 AllocateTessellationData(pContext);
1430 }
1431 else
1432 {
1433 SWR_ASSERT(state.tsState.tsEnable == false);
1434 SWR_ASSERT(state.pfnHsFunc == nullptr);
1435 SWR_ASSERT(state.pfnDsFunc == nullptr);
1436 }
1437
1438 // allocate space for streamout input prim data
1439 uint32_t* pSoPrimData = nullptr;
1440 if (HasStreamOutT::value)
1441 {
1442 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1443 }
1444
1445 const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1446 #if USE_SIMD16_FRONTEND
1447 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1448 #else
1449 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1450 #endif
1451
1452 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1453
1454 // Compute storage requirements for vertex store
1455 // TODO: allocation needs to be rethought for better cut support
1456 uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
1457 uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1458
1459 // grow the vertex store for the PA as necessary
1460 if (gVertexStoreSize < vertexStoreSize)
1461 {
1462 if (pVertexStore != nullptr)
1463 {
1464 AlignedFree(pVertexStore);
1465 }
1466
1467 pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
1468 gVertexStoreSize = vertexStoreSize;
1469
1470 SWR_ASSERT(pVertexStore != nullptr);
1471 }
1472
1473 // choose primitive assembler
1474 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize);
1475 PA_STATE& pa = paFactory.GetPA();
1476
1477 #if USE_SIMD16_FRONTEND
1478 simdvertex vin_lo;
1479 simdvertex vin_hi;
1480 SWR_VS_CONTEXT vsContext_lo;
1481 SWR_VS_CONTEXT vsContext_hi;
1482
1483 vsContext_lo.pVin = &vin_lo;
1484 vsContext_hi.pVin = &vin_hi;
1485 vsContext_lo.AlternateOffset = 0;
1486 vsContext_hi.AlternateOffset = 1;
1487
1488 SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
1489
1490 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1491 fetchInfo_lo.StartInstance = work.startInstance;
1492 fetchInfo_lo.StartVertex = 0;
1493
1494 if (IsIndexedT::value)
1495 {
1496 fetchInfo_lo.BaseVertex = work.baseVertex;
1497
1498 // if the entire index buffer isn't being consumed, set the last index
1499 // so that fetches of less than a full SIMD's worth of indices are masked off
1500 fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1501 if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
1502 {
1503 fetchInfo_lo.pLastIndex = pLastRequestedIndex;
1504 }
1505 }
1506 else
1507 {
1508 fetchInfo_lo.StartVertex = work.startVertex;
1509 }
1510
1511 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1512
1513 const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1514
1515 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1516 {
1517 uint32_t i = 0;
1518
1519 simd16scalari vIndex;
1520
1521 if (IsIndexedT::value)
1522 {
1523 fetchInfo_lo.pIndices = work.pIB;
1524 fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH
1525 }
1526 else
1527 {
1528 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1529
1530 fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
1531 fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
1532 }
1533
1534 fetchInfo_lo.CurInstance = instanceNum;
1535 fetchInfo_hi.CurInstance = instanceNum;
1536
1537 vsContext_lo.InstanceID = instanceNum;
1538 vsContext_hi.InstanceID = instanceNum;
1539
1540 while (pa.HasWork())
1541 {
1542 // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1543 // So we need to keep this call outside of the (i < endVertex) check.
1544
1545 simdmask *pvCutIndices_lo = nullptr;
1546 simdmask *pvCutIndices_hi = nullptr;
1547
1548 if (IsIndexedT::value)
1549 {
1550 // simd16mask <=> simdmask[2]
1551
1552 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
1553 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
1554 }
1555
1556 simd16vertex &vout = pa.GetNextVsOutput();
1557
1558 vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
1559 vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
1560
1561 if (i < endVertex)
1562 {
1563 // 1. Execute FS/VS for a single SIMD.
1564 AR_BEGIN(FEFetchShader, pDC->drawId);
1565 state.pfnFetchFunc(fetchInfo_lo, vin_lo);
1566
1567 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1568 {
1569 state.pfnFetchFunc(fetchInfo_hi, vin_hi);
1570 }
1571 AR_END(FEFetchShader, 0);
1572
1573 // forward fetch generated vertex IDs to the vertex shader
1574 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1575 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1576
1577 // Setup active mask for vertex shader.
1578 vsContext_lo.mask = GenerateMask(endVertex - i);
1579 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1580
1581 // forward cut mask to the PA
1582 if (IsIndexedT::value)
1583 {
1584 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1585 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1586 }
1587
1588 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1589
1590 #if KNOB_ENABLE_TOSS_POINTS
1591 if (!KNOB_TOSS_FETCH)
1592 #endif
1593 {
1594 AR_BEGIN(FEVertexShader, pDC->drawId);
1595 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1596
1597 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1598 {
1599 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
1600 }
1601 AR_END(FEVertexShader, 0);
1602
1603 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1604 }
1605 }
1606
1607 // 2. Assemble primitives given the last two SIMDs of vertices.
1608 do
1609 {
1610 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1611
1612 RDTSC_START(FEPAAssemble);
1613 bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
1614 RDTSC_STOP(FEPAAssemble, 1, 0);
1615
1616 #if KNOB_ENABLE_TOSS_POINTS
1617 if (!KNOB_TOSS_FETCH)
1618 #endif
1619 {
1620 #if KNOB_ENABLE_TOSS_POINTS
1621 if (!KNOB_TOSS_VS)
1622 #endif
1623 {
1624 if (assemble)
1625 {
1626 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1627
1628 const uint32_t numPrims = pa.NumPrims();
1629 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1630 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1631
1632 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1633 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1634 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1635
1636 if (HasTessellationT::value)
1637 {
1638 pa.useAlternateOffset = false;
1639 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
1640
1641 if (numPrims_hi)
1642 {
1643 pa.useAlternateOffset = true;
1644 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
1645 }
1646 }
1647 else if (HasGeometryShaderT::value)
1648 {
1649 pa.useAlternateOffset = false;
1650 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
1651
1652 if (numPrims_hi)
1653 {
1654 pa.useAlternateOffset = true;
1655 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
1656 }
1657 }
1658 else
1659 {
1660 // If streamout is enabled then stream vertices out to memory.
1661 if (HasStreamOutT::value)
1662 {
1663 pa.useAlternateOffset = false;
1664 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1665 }
1666
1667 if (HasRastT::value)
1668 {
1669 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
1670
1671 pa.useAlternateOffset = false;
1672 pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
1673 }
1674 }
1675 }
1676 }
1677 }
1678 } while (pa.NextPrim());
1679
1680 if (IsIndexedT::value)
1681 {
1682 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1683 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1684 }
1685 else
1686 {
1687 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
1688 }
1689
1690 i += KNOB_SIMD16_WIDTH;
1691 }
1692
1693 pa.Reset();
1694 }
1695
1696 #else
1697 SWR_VS_CONTEXT vsContext;
1698 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1699
1700 fetchInfo.pStreams = &state.vertexBuffers[0];
1701 fetchInfo.StartInstance = work.startInstance;
1702 fetchInfo.StartVertex = 0;
1703
1704 if (IsIndexedT::value)
1705 {
1706 fetchInfo.BaseVertex = work.baseVertex;
1707
1708 // if the entire index buffer isn't being consumed, set the last index
1709 // so that fetches of less than a full SIMD's worth of indices are masked off
1710 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1711 if (pLastRequestedIndex < fetchInfo.pLastIndex)
1712 {
1713 fetchInfo.pLastIndex = pLastRequestedIndex;
1714 }
1715 }
1716 else
1717 {
1718 fetchInfo.StartVertex = work.startVertex;
1719 }
1720
1721 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1722
1723 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1724 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1725 {
1726 simdscalari vIndex;
1727 uint32_t i = 0;
1728
1729 if (IsIndexedT::value)
1730 {
1731 fetchInfo.pIndices = work.pIB;
1732 }
1733 else
1734 {
1735 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1736 fetchInfo.pIndices = (const int32_t*)&vIndex;
1737 }
1738
1739 fetchInfo.CurInstance = instanceNum;
1740 vsContext.InstanceID = instanceNum;
1741
1742 while (pa.HasWork())
1743 {
1744 // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1745 // So we need to keep this call outside of the (i < endVertex) check.
1746 simdmask* pvCutIndices = nullptr;
1747 if (IsIndexedT::value)
1748 {
1749 pvCutIndices = &pa.GetNextVsIndices();
1750 }
1751
1752 simdvertex& vout = pa.GetNextVsOutput();
1753 vsContext.pVin = &vout;
1754 vsContext.pVout = &vout;
1755
1756 if (i < endVertex)
1757 {
1758
1759 // 1. Execute FS/VS for a single SIMD.
1760 AR_BEGIN(FEFetchShader, pDC->drawId);
1761 state.pfnFetchFunc(fetchInfo, vout);
1762 AR_END(FEFetchShader, 0);
1763
1764 // forward fetch generated vertex IDs to the vertex shader
1765 vsContext.VertexID = fetchInfo.VertexID;
1766
1767 // Setup active mask for vertex shader.
1768 vsContext.mask = GenerateMask(endVertex - i);
1769
1770 // forward cut mask to the PA
1771 if (IsIndexedT::value)
1772 {
1773 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1774 }
1775
1776 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1777
1778 #if KNOB_ENABLE_TOSS_POINTS
1779 if (!KNOB_TOSS_FETCH)
1780 #endif
1781 {
1782 AR_BEGIN(FEVertexShader, pDC->drawId);
1783 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1784 AR_END(FEVertexShader, 0);
1785
1786 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1787 }
1788 }
1789
1790 // 2. Assemble primitives given the last two SIMDs of vertices.
1791 do
1792 {
1793 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1794 // pa.Assemble returns false if there are not enough verts to assemble.
1795 AR_BEGIN(FEPAAssemble, pDC->drawId);
1796 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1797 AR_END(FEPAAssemble, 1);
1798
1799 #if KNOB_ENABLE_TOSS_POINTS
1800 if (!KNOB_TOSS_FETCH)
1801 #endif
1802 {
1803 #if KNOB_ENABLE_TOSS_POINTS
1804 if (!KNOB_TOSS_VS)
1805 #endif
1806 {
1807 if (assemble)
1808 {
1809 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1810
1811 if (HasTessellationT::value)
1812 {
1813 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1814 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1815 }
1816 else if (HasGeometryShaderT::value)
1817 {
1818 GeometryShaderStage<HasStreamOutT, HasRastT>(
1819 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1820 }
1821 else
1822 {
1823 // If streamout is enabled then stream vertices out to memory.
1824 if (HasStreamOutT::value)
1825 {
1826 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1827 }
1828
1829 if (HasRastT::value)
1830 {
1831 SWR_ASSERT(pDC->pState->pfnProcessPrims);
1832
1833 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1834 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
1835 }
1836 }
1837 }
1838 }
1839 }
1840 } while (pa.NextPrim());
1841
1842 if (IsIndexedT::value)
1843 {
1844 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1845 }
1846 else
1847 {
1848 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1849 }
1850
1851 i += KNOB_SIMD_WIDTH;
1852 }
1853 pa.Reset();
1854 }
1855
1856 #endif
1857
1858 AR_END(FEProcessDraw, numPrims * work.numInstances);
1859 }
1860
1861 struct FEDrawChooser
1862 {
1863 typedef PFN_FE_WORK_FUNC FuncType;
1864
1865 template <typename... ArgsB>
1866 static FuncType GetFunc()
1867 {
1868 return ProcessDraw<ArgsB...>;
1869 }
1870 };
1871
1872
1873 // Selector for correct templated Draw front-end function
1874 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1875 bool IsIndexed,
1876 bool IsCutIndexEnabled,
1877 bool HasTessellation,
1878 bool HasGeometryShader,
1879 bool HasStreamOut,
1880 bool HasRasterization)
1881 {
1882 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1883 }
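// Illustrative note: TemplateArgUnroller maps the six runtime bools onto the
// matching compile-time instantiation, so e.g.
// GetProcessDrawFunc(true, false, false, false, false, true) resolves to
// ProcessDraw<std::true_type, std::false_type, std::false_type,
//             std::false_type, std::false_type, std::true_type>.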