c150c51199f5b584aef5863dc6218d8a714ea07d
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Even thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
58 void ProcessSync(
59 SWR_CONTEXT *pContext,
60 DRAW_CONTEXT *pDC,
61 uint32_t workerId,
62 void *pUserData)
63 {
64 BE_WORK work;
65 work.type = SYNC;
66 work.pfnWork = ProcessSyncBE;
67
68 MacroTileMgr *pTileMgr = pDC->pTileMgr;
69 pTileMgr->enqueue(0, 0, &work);
70 }
71
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Even thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
78 void ProcessShutdown(
79 SWR_CONTEXT *pContext,
80 DRAW_CONTEXT *pDC,
81 uint32_t workerId,
82 void *pUserData)
83 {
84 BE_WORK work;
85 work.type = SHUTDOWN;
86 work.pfnWork = ProcessShutdownBE;
87
88 MacroTileMgr *pTileMgr = pDC->pTileMgr;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94 {
95 for (uint32_t n = 0; n < numNumaNodes; ++n)
96 {
97 pTileMgr->enqueue(i, n, &work);
98 }
99 }
100 }
101
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Even thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
109 void ProcessClear(
110 SWR_CONTEXT *pContext,
111 DRAW_CONTEXT *pDC,
112 uint32_t workerId,
113 void *pUserData)
114 {
115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116 MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124
125 BE_WORK work;
126 work.type = CLEAR;
127 work.pfnWork = ProcessClearBE;
128 work.desc.clear = *pDesc;
129
130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131 {
132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133 {
134 pTileMgr->enqueue(x, y, &work);
135 }
136 }
137 }
138
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Even thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT *pContext,
148 DRAW_CONTEXT *pDC,
149 uint32_t workerId,
150 void *pUserData)
151 {
152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153 MacroTileMgr *pTileMgr = pDC->pTileMgr;
154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163 // store tiles
164 BE_WORK work;
165 work.type = STORETILES;
166 work.pfnWork = ProcessStoreTilesBE;
167 work.desc.storeTiles = *pDesc;
168
169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170 {
171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172 {
173 pTileMgr->enqueue(x, y, &work);
174 }
175 }
176
177 AR_END(FEProcessStoreTiles, 0);
178 }
179
180 //////////////////////////////////////////////////////////////////////////
181 /// @brief FE handler for SwrInvalidateTiles.
182 /// @param pContext - pointer to SWR context.
183 /// @param pDC - pointer to draw context.
184 /// @param workerId - thread's worker id. Even thread has a unique id.
185 /// @param pUserData - Pointer to user data passed back to callback.
186 /// @todo This should go away when we switch this to use compute threading.
187 void ProcessDiscardInvalidateTiles(
188 SWR_CONTEXT *pContext,
189 DRAW_CONTEXT *pDC,
190 uint32_t workerId,
191 void *pUserData)
192 {
193 AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
194 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195 MacroTileMgr *pTileMgr = pDC->pTileMgr;
196
197 // compute macro tile bounds for the specified rect
198 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
199 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
200 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
201 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202
203 if (pDesc->fullTilesOnly == false)
204 {
205 // include partial tiles
206 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
207 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
208 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
209 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210 }
211
212 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
213 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214
215 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
216 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217
218 // load tiles
219 BE_WORK work;
220 work.type = DISCARDINVALIDATETILES;
221 work.pfnWork = ProcessDiscardInvalidateTilesBE;
222 work.desc.discardInvalidateTiles = *pDesc;
223
224 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225 {
226 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227 {
228 pTileMgr->enqueue(x, y, &work);
229 }
230 }
231
232 AR_END(FEProcessInvalidateTiles, 0);
233 }
234
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Computes the number of primitives given the number of verts.
237 /// @param mode - primitive topology for draw operation.
238 /// @param numPrims - number of vertices or indices for draw.
239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
240 uint32_t GetNumPrims(
241 PRIMITIVE_TOPOLOGY mode,
242 uint32_t numPrims)
243 {
244 switch (mode)
245 {
246 case TOP_POINT_LIST: return numPrims;
247 case TOP_TRIANGLE_LIST: return numPrims / 3;
248 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251 case TOP_QUAD_LIST: return numPrims / 4;
252 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254 case TOP_LINE_LIST: return numPrims / 2;
255 case TOP_LINE_LOOP: return numPrims;
256 case TOP_RECT_LIST: return numPrims / 3;
257 case TOP_LINE_LIST_ADJ: return numPrims / 4;
258 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259 case TOP_TRI_LIST_ADJ: return numPrims / 6;
260 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261
262 case TOP_PATCHLIST_1:
263 case TOP_PATCHLIST_2:
264 case TOP_PATCHLIST_3:
265 case TOP_PATCHLIST_4:
266 case TOP_PATCHLIST_5:
267 case TOP_PATCHLIST_6:
268 case TOP_PATCHLIST_7:
269 case TOP_PATCHLIST_8:
270 case TOP_PATCHLIST_9:
271 case TOP_PATCHLIST_10:
272 case TOP_PATCHLIST_11:
273 case TOP_PATCHLIST_12:
274 case TOP_PATCHLIST_13:
275 case TOP_PATCHLIST_14:
276 case TOP_PATCHLIST_15:
277 case TOP_PATCHLIST_16:
278 case TOP_PATCHLIST_17:
279 case TOP_PATCHLIST_18:
280 case TOP_PATCHLIST_19:
281 case TOP_PATCHLIST_20:
282 case TOP_PATCHLIST_21:
283 case TOP_PATCHLIST_22:
284 case TOP_PATCHLIST_23:
285 case TOP_PATCHLIST_24:
286 case TOP_PATCHLIST_25:
287 case TOP_PATCHLIST_26:
288 case TOP_PATCHLIST_27:
289 case TOP_PATCHLIST_28:
290 case TOP_PATCHLIST_29:
291 case TOP_PATCHLIST_30:
292 case TOP_PATCHLIST_31:
293 case TOP_PATCHLIST_32:
294 return numPrims / (mode - TOP_PATCHLIST_BASE);
295
296 case TOP_POLYGON:
297 case TOP_POINT_LIST_BF:
298 case TOP_LINE_STRIP_CONT:
299 case TOP_LINE_STRIP_BF:
300 case TOP_LINE_STRIP_CONT_BF:
301 case TOP_TRIANGLE_FAN_NOSTIPPLE:
302 case TOP_TRI_STRIP_REVERSE:
303 case TOP_PATCHLIST_BASE:
304 case TOP_UNKNOWN:
305 SWR_ASSERT(false, "Unsupported topology: %d", mode);
306 return 0;
307 }
308
309 return 0;
310 }
311
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Computes the number of verts given the number of primitives.
314 /// @param mode - primitive topology for draw operation.
315 /// @param numPrims - number of primitives for draw.
316 uint32_t GetNumVerts(
317 PRIMITIVE_TOPOLOGY mode,
318 uint32_t numPrims)
319 {
320 switch (mode)
321 {
322 case TOP_POINT_LIST: return numPrims;
323 case TOP_TRIANGLE_LIST: return numPrims * 3;
324 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327 case TOP_QUAD_LIST: return numPrims * 4;
328 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330 case TOP_LINE_LIST: return numPrims * 2;
331 case TOP_LINE_LOOP: return numPrims;
332 case TOP_RECT_LIST: return numPrims * 3;
333 case TOP_LINE_LIST_ADJ: return numPrims * 4;
334 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335 case TOP_TRI_LIST_ADJ: return numPrims * 6;
336 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337
338 case TOP_PATCHLIST_1:
339 case TOP_PATCHLIST_2:
340 case TOP_PATCHLIST_3:
341 case TOP_PATCHLIST_4:
342 case TOP_PATCHLIST_5:
343 case TOP_PATCHLIST_6:
344 case TOP_PATCHLIST_7:
345 case TOP_PATCHLIST_8:
346 case TOP_PATCHLIST_9:
347 case TOP_PATCHLIST_10:
348 case TOP_PATCHLIST_11:
349 case TOP_PATCHLIST_12:
350 case TOP_PATCHLIST_13:
351 case TOP_PATCHLIST_14:
352 case TOP_PATCHLIST_15:
353 case TOP_PATCHLIST_16:
354 case TOP_PATCHLIST_17:
355 case TOP_PATCHLIST_18:
356 case TOP_PATCHLIST_19:
357 case TOP_PATCHLIST_20:
358 case TOP_PATCHLIST_21:
359 case TOP_PATCHLIST_22:
360 case TOP_PATCHLIST_23:
361 case TOP_PATCHLIST_24:
362 case TOP_PATCHLIST_25:
363 case TOP_PATCHLIST_26:
364 case TOP_PATCHLIST_27:
365 case TOP_PATCHLIST_28:
366 case TOP_PATCHLIST_29:
367 case TOP_PATCHLIST_30:
368 case TOP_PATCHLIST_31:
369 case TOP_PATCHLIST_32:
370 return numPrims * (mode - TOP_PATCHLIST_BASE);
371
372 case TOP_POLYGON:
373 case TOP_POINT_LIST_BF:
374 case TOP_LINE_STRIP_CONT:
375 case TOP_LINE_STRIP_BF:
376 case TOP_LINE_STRIP_CONT_BF:
377 case TOP_TRIANGLE_FAN_NOSTIPPLE:
378 case TOP_TRI_STRIP_REVERSE:
379 case TOP_PATCHLIST_BASE:
380 case TOP_UNKNOWN:
381 SWR_ASSERT(false, "Unsupported topology: %d", mode);
382 return 0;
383 }
384
385 return 0;
386 }
387
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Return number of verts per primitive.
390 /// @param topology - topology
391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393 {
394 uint32_t numVerts = 0;
395 switch (topology)
396 {
397 case TOP_POINT_LIST:
398 case TOP_POINT_LIST_BF:
399 numVerts = 1;
400 break;
401 case TOP_LINE_LIST:
402 case TOP_LINE_STRIP:
403 case TOP_LINE_LIST_ADJ:
404 case TOP_LINE_LOOP:
405 case TOP_LINE_STRIP_CONT:
406 case TOP_LINE_STRIP_BF:
407 case TOP_LISTSTRIP_ADJ:
408 numVerts = 2;
409 break;
410 case TOP_TRIANGLE_LIST:
411 case TOP_TRIANGLE_STRIP:
412 case TOP_TRIANGLE_FAN:
413 case TOP_TRI_LIST_ADJ:
414 case TOP_TRI_STRIP_ADJ:
415 case TOP_TRI_STRIP_REVERSE:
416 case TOP_RECT_LIST:
417 numVerts = 3;
418 break;
419 case TOP_QUAD_LIST:
420 case TOP_QUAD_STRIP:
421 numVerts = 4;
422 break;
423 case TOP_PATCHLIST_1:
424 case TOP_PATCHLIST_2:
425 case TOP_PATCHLIST_3:
426 case TOP_PATCHLIST_4:
427 case TOP_PATCHLIST_5:
428 case TOP_PATCHLIST_6:
429 case TOP_PATCHLIST_7:
430 case TOP_PATCHLIST_8:
431 case TOP_PATCHLIST_9:
432 case TOP_PATCHLIST_10:
433 case TOP_PATCHLIST_11:
434 case TOP_PATCHLIST_12:
435 case TOP_PATCHLIST_13:
436 case TOP_PATCHLIST_14:
437 case TOP_PATCHLIST_15:
438 case TOP_PATCHLIST_16:
439 case TOP_PATCHLIST_17:
440 case TOP_PATCHLIST_18:
441 case TOP_PATCHLIST_19:
442 case TOP_PATCHLIST_20:
443 case TOP_PATCHLIST_21:
444 case TOP_PATCHLIST_22:
445 case TOP_PATCHLIST_23:
446 case TOP_PATCHLIST_24:
447 case TOP_PATCHLIST_25:
448 case TOP_PATCHLIST_26:
449 case TOP_PATCHLIST_27:
450 case TOP_PATCHLIST_28:
451 case TOP_PATCHLIST_29:
452 case TOP_PATCHLIST_30:
453 case TOP_PATCHLIST_31:
454 case TOP_PATCHLIST_32:
455 numVerts = topology - TOP_PATCHLIST_BASE;
456 break;
457 default:
458 SWR_ASSERT(false, "Unsupported topology: %d", topology);
459 break;
460 }
461
462 if (includeAdjVerts)
463 {
464 switch (topology)
465 {
466 case TOP_LISTSTRIP_ADJ:
467 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468 case TOP_TRI_STRIP_ADJ:
469 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470 default: break;
471 }
472 }
473
474 return numVerts;
475 }
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numWorkItems - Number of items being worked on by a SIMD.
480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484 return _simd_castps_si(vMask(mask));
485 }
486
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object providing the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer that assembled per-prim attributes are
///        staged into before the streamout JIT consumes them.
/// @param streamIndex - index of the SO stream being written.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Verts per prim without adjacency verts; streamout never writes adj verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // Bitmask of attribute slots enabled for this SO stream.
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): the offset is computed from 'slot' directly, so
            // prim-data slot N holds PA slot N + VERTEX_ATTRIB_START_SLOT; the
            // SO JIT must use the same slot numbering — confirm against the
            // streamout shader generator.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot and move to the next enabled attribute.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
577
578 //////////////////////////////////////////////////////////////////////////
579 /// @brief Computes number of invocations. The current index represents
580 /// the start of the SIMD. The max index represents how much work
581 /// items are remaining. If there is less then a SIMD's xmin of work
582 /// then return the remaining amount of work.
583 /// @param curIndex - The start index for the SIMD.
584 /// @param maxIndex - The last index for all work items.
585 static INLINE uint32_t GetNumInvocations(
586 uint32_t curIndex,
587 uint32_t maxIndex)
588 {
589 uint32_t remainder = (maxIndex - curIndex);
590 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
591 }
592
593 //////////////////////////////////////////////////////////////////////////
594 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
595 /// The geometry shader will loop over each active streamout buffer, assembling
596 /// primitives for the downstream stages. When multistream output is enabled,
597 /// the generated stream ID buffer from the GS needs to be converted to a cut
598 /// buffer for the primitive assembler.
599 /// @param stream - stream id to generate the cut buffer for
600 /// @param pStreamIdBase - pointer to the stream ID buffer
601 /// @param numEmittedVerts - Number of total verts emitted by the GS
602 /// @param pCutBuffer - output buffer to write cuts to
603 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
604 {
605 SWR_ASSERT(stream < MAX_SO_STREAMS);
606
607 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
608 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
609
610 for (uint32_t b = 0; b < numOutputBytes; ++b)
611 {
612 uint8_t curInputByte = pStreamIdBase[2*b];
613 uint8_t outByte = 0;
614 for (uint32_t i = 0; i < 4; ++i)
615 {
616 if ((curInputByte & 0x3) != stream)
617 {
618 outByte |= (1 << i);
619 }
620 curInputByte >>= 2;
621 }
622
623 curInputByte = pStreamIdBase[2 * b + 1];
624 for (uint32_t i = 0; i < 4; ++i)
625 {
626 if ((curInputByte & 0x3) != stream)
627 {
628 outByte |= (1 << (i + 4));
629 }
630 curInputByte >>= 2;
631 }
632
633 *pCutBuffer++ = outByte;
634 }
635 }
636
// Per worker-thread GS execution context, reused across invocations to avoid
// re-initializing the (large) context on every primitive batch.
THREAD SWR_GS_CONTEXT tlsGsContext;
638
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (single stream) or stream-id (multi stream) buffer
///        written by the GS, one region per input prim per GS instance.
/// @param pStreamCutBuffer - scratch cut buffer used when translating a
///        multi-stream stream-id buffer, one stream at a time.
/// @param pSoPrimData - scratch buffer for streamout primitive data.
/// @param primID - SIMD of input primitive IDs.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // Input prim vertex count, including adjacency verts (GS consumes them).
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides through the GS output buffer: each input prim gets enough
    // simd-padded vertex batches to hold maxNumVerts; each GS instance gets
    // one such region per SIMD lane.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per possible output vertex
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi stream: 2 stream-id bits per possible output vertex, dword aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Execute the GS once per declared instance; each execution advances the
    // thread-local output pointers to that instance's region.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    //
    // NOTE(review): pVertexCount reinterprets the SIMD vertexCount register as
    // per-lane uint32s; it is indexed by inputPrim only, yet read inside the
    // per-instance loop below after the GS ran once per instance — confirm the
    // emitted vertex count is identical across instances, or this reads the
    // last instance's counts for all instances.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: intentionally shadows the 'pCutBuffer' parameter; this
                // local selects between the GS-written cut buffer and the
                // per-stream translated scratch buffer.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // only the stream bound to the rasterizer is clipped/binned
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
    AR_END(FEGeometryShader, 1);
}
859
860 //////////////////////////////////////////////////////////////////////////
861 /// @brief Allocate GS buffers
862 /// @param pDC - pointer to draw context.
863 /// @param state - API state
864 /// @param ppGsOut - pointer to GS output buffer allocation
865 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
866 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
867 void **ppStreamCutBuffer)
868 {
869 auto pArena = pDC->pArena;
870 SWR_ASSERT(pArena != nullptr);
871 SWR_ASSERT(state.gsState.gsEnable);
872 // allocate arena space to hold GS output verts
873 // @todo pack attribs
874 // @todo support multiple streams
875 const uint32_t vertexStride = sizeof(simdvertex);
876 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
877 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
878 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
879
880 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
881 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
882 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
883 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
884
885 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
886 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
887
888 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
889 if (state.gsState.isSingleStream)
890 {
891 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
892 *ppStreamCutBuffer = nullptr;
893 }
894 else
895 {
896 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
897 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
898 }
899
900 }
901
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // hull shader execution context
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // per-SIMD-lane HS output patch data
    void* pTxCtx;                           // opaque tessellator context storage
    size_t tsCtxSize;                       // size in bytes of the pTxCtx allocation

    simdscalar* pDSOutput;                  // domain shader output buffer
    size_t numDSOutputVectors;              // capacity of pDSOutput, in simd vectors
};

// Per worker-thread tessellation scratch data; lazily allocated by
// AllocateTessellationData().
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
917
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Allocate tessellation data for this worker thread.
920 INLINE
921 static void AllocateTessellationData(SWR_CONTEXT* pContext)
922 {
923 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
924 if (gt_pTessellationThreadData == nullptr)
925 {
926 gt_pTessellationThreadData = (TessellationThreadLocalData*)
927 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
928 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
929 }
930 }
931
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// Runs the hull shader over the assembled patch primitives, tessellates each
/// patch, runs the domain shader over the generated domain points, and
/// forwards the resulting primitives to the GS / stream-out / clip stages as
/// selected by the template arguments.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut (or stream-id) buffer, forwarded to the GS stage.
/// @param pCutStreamBuffer - temporary per-stream cut buffer for multi-stream GS.
/// @param pSoPrimData - scratch space for stream-out primitive data.
/// @param primID - SIMD of input patch primitive IDs.
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator with the per-thread context storage; when
    // that storage is too small TSInitCtx returns null, so reallocate with
    // the (updated) tsCtxSize and retry once.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Pick the clipper/binner matching the topology the DS will produce.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison the HS output so reads of unwritten control points are visible.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (one per active SIMD lane).
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            // Patch produced nothing (tessellator culled it); skip to the next.
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // Grow the cached per-thread DS output buffer; previous contents
            // are not needed, so free + alloc rather than realloc.
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        // One DS invocation per SIMD-width group of domain points; the last
        // iteration's mask covers only the remaining points.
        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Assemble the tessellator-generated index list over the DS output.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1127
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// Fetches and vertex-shades the draw's vertices a SIMD batch at a time,
/// assembles primitives, and forwards them through the optional
/// tessellation / GS / stream-out stages to binning.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the primitive-restart cut index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address just past the last requested
    // index so partial-SIMD fetches can be masked off at the buffer end.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    // Only needed for the AR_END stat at the bottom of this function.
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
    // SIMD16 front end: fetch/shade two SIMD8 halves (lo/hi) per iteration,
    // merge into one SIMD16 vertex batch, and split back into SIMD8 halves
    // when handing primitives to the binner.
    simdvertex vin_lo;
    simdvertex vin_hi;
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;

    SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
        {
            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.pIndices = work.pIB;
            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.pIndices = (const int32_t *)&vIndex.lo;
            fetchInfo_hi.pIndices = (const int32_t *)&vIndex.hi;
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.

            simdmask *pvCutIndices_lo = nullptr;
            simdmask *pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
            }

            simdvertex vout_lo;
            simdvertex vout_hi;

            vsContext_lo.pVout = &vout_lo;
            vsContext_hi.pVout = &vout_hi;

            simd16vertex &vout = pa.GetNextVsOutput();

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo_lo, vin_lo);
                if ((i + KNOB_SIMD_WIDTH) < endVertex)
                {
                    // hi half only when more than a SIMD8's worth of vertices remain
                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
                }
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;

                // Setup active mask for vertex shader.
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                    // copy SIMD vout_lo to lo part of SIMD16 vout
                    {
                        const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes;

                        // NOTE: this inner `i` deliberately shadows the vertex counter above.
                        for (uint32_t i = 0; i < voutNumSlots; i += 1)
                        {
                            for (uint32_t j = 0; j < 4; j += 1)
                            {
                                vout.attrib[i][j].lo = vout_lo.attrib[i][j];
                            }
                        }
                    }

                    if ((i + KNOB_SIMD_WIDTH) < endVertex)
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);

                        // copy SIMD vout_hi to hi part of SIMD16 vout
                        {
                            const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes;

                            for (uint32_t i = 0; i < voutNumSlots; i += 1)
                            {
                                for (uint32_t j = 0; j < 4; j += 1)
                                {
                                    vout.attrib[i][j].hi = vout_hi.attrib[i][j];
                                }
                            }
                        }
                    }
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Tessellation / GS / stream-out paths are not yet
                            // wired up for the SIMD16 front end.
#if 0
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
#endif
                            {
#if 0
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

#endif
                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    // Split the SIMD16 batch into two SIMD8
                                    // halves for the binner.
                                    uint32_t mask = GenMask(pa.NumPrims());
                                    uint32_t mask_lo = mask & 255;
                                    uint32_t mask_hi = (mask >> 8) & 255;

                                    simd16scalari primid = pa.GetPrimID(work.startPrimID);
                                    simdscalari primid_lo = primid.lo;
                                    simdscalari primid_hi = primid.hi;

                                    simdvector prim[MAX_NUM_VERTS_PER_PRIM];

                                    for (uint32_t i = 0; i < 3; i += 1)
                                    {
                                        for (uint32_t j = 0; j < 4; j += 1)
                                        {
                                            prim[i][j] = prim_simd16[i][j].lo;
                                        }
                                    }

                                    pa.useAlternateOffset = false;
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_lo, primid_lo, _simd_setzero_si());

                                    if (mask_hi)
                                    {
                                        for (uint32_t i = 0; i < 3; i += 1)
                                        {
                                            for (uint32_t j = 0; j < 4; j += 1)
                                            {
                                                prim[i][j] = prim_simd16[i][j].hi;
                                            }
                                        }

                                        pa.useAlternateOffset = true;
                                        pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_hi, primid_hi, _simd_setzero_si());
                                    }
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    simdvertex vin;
    SWR_VS_CONTEXT vsContext;

    vsContext.pVin = &vin;

    SWR_FETCH_CONTEXT fetchInfo = { 0 };

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1662
//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to map a set of template
/// boolean arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    /// Returns the ProcessDraw instantiation for the given argument pack.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1673
1674
1675 // Selector for correct templated Draw front-end function
1676 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1677 bool IsIndexed,
1678 bool IsCutIndexEnabled,
1679 bool HasTessellation,
1680 bool HasGeometryShader,
1681 bool HasStreamOut,
1682 bool HasRasterization)
1683 {
1684 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1685 }