/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.cpp
*
* @brief Implementation for Frontend which handles vertex processing,
*        primitive assembly, clipping, binning, etc.
*
******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to generate a bitmask with the low numBits set
static INLINE uint32_t GenMask(uint32_t numBits)
{
    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);

    // Shifting a 32-bit value by 32 is undefined behavior; handle the full mask explicitly.
    return (numBits == 32) ? 0xFFFFFFFFU : ((1U << numBits) - 1);
}
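
// Examples (editor's note, illustrative only): GenMask(3) == 0b111,
// GenMask(0) == 0, and GenMask(32) yields the full 0xFFFFFFFF mask without
// invoking the undefined 32-bit shift.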

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    BE_WORK work;
    work.type = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    BE_WORK work;
    work.type = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    // Enqueue at least one work item for each worker thread,
    // accounting for the number of NUMA nodes.
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type = CLEAR;
    work.pfnWork = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type = STORETILES;
    work.pfnWork = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }
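
    // Worked example (editor's note, illustrative only, assuming 64-pixel
    // macrotiles): for rect x in [16, 200), the full-tiles-only bounds round
    // xmin up and xmax down, giving tiles 1..2, while including partial
    // tiles widens that to tiles 0..3.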

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
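
// Sanity examples (editor's note, illustrative only): 10 vertices as
// TOP_TRIANGLE_LIST yield 3 triangles (the 10th vertex is dropped), as
// TOP_TRIANGLE_STRIP they yield 8, and as TOP_TRI_STRIP_ADJ they yield
// (10 / 2) - 2 = 3 primitives.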

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
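
// Note (editor's note, illustrative only): adjacency is additive. For
// example, TOP_TRI_LIST_ADJ reports 3 verts per primitive without adjacent
// verts and 6 with them, matching GetNumPrims' divide-by-6 for that topology.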

//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items still to be worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}
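
// Example (editor's note, illustrative only): with KNOB_SIMD_WIDTH == 8,
// GenerateMask(3) builds the lane mask 0b00000111, enabling only the three
// lanes that still have work; GenerateMask16 does the same over 16 lanes.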

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pPrimData - scratch buffer used to stage primitive attributes for the SO shader.
/// @param streamIndex - index of the stream-out stream to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all
    // SWR_VTX_NUM_SLOTS attributes for each vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note: GL begins attributes at slot 0 in the prim data buffer, so
            // the slot index converts directly to a dword offset here.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}
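
// Examples (editor's note, illustrative only): RoundUpEven(5) == 6,
// RoundUpEven(6) == 6, RoundDownEven(5) == 4. These keep simd8 vector
// counts pairable when packing into simd16 operations below.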

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items remain. If there is less than a SIMD's width of work
///        remaining, return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(
    uint32_t curIndex,
    uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}
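
// Worked example (editor's note, illustrative only): each vertex carries a
// 2-bit stream ID, so two input bytes (8 vertices) fold into one cut byte.
// For stream == 1 and input bytes {0x44, 0x44} (vertices 1, 3, 5 and 7 on
// stream 1), the cut byte is 0x55: a 1 ("cut") for every vertex that does
// NOT belong to the requested stream.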

// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void* pStreamCutBuffer;
};
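
// Member roles (editor's note, summarizing their use below): pGsIn holds the
// assembled input vertices for one simd primitive; pGsOut is one output
// stream buffer per SIMD lane (one lane per input primitive); pGsTransposed
// receives GS output rearranged for the primitive assembler; and
// pStreamCutBuffer is scratch space for multi-stream cut-buffer conversion.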

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from the scalar per-vertex layout written by
///        the GS into the SIMD-wide layout consumed by the primitive assembler
/// @param pDst - Destination buffer in SIMD form for the current SIMD width, fed into the primitive assembler
/// @param pSrc - Buffer of scalar vertices written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template<typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask = GenMask(mask);
        auto vMask = SIMD_T::vmask_ps(mask);
        auto viMask = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
            auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}
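
// How the gather works (editor's note, illustrative only): vGatherOffsets
// holds {0, stride, 2*stride, ...}, so each gather pulls the same component
// (e.g. attribute X) from SimdWidth consecutive vertices into one SIMD
// register. The mask from GenMask() disables lanes past numVerts on the
// final, partially filled iteration, so neither the gathers nor the
// maskstores touch memory beyond the buffers.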


//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffer allocations
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    GsBuffers* pGsBuffers,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari const &primID)
{
    RDTSC_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT gsContext;

    static uint8_t sNullBuffer[128] = { 0 };

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
        uint32_t attribSlot = pState->vertexAttribOffset + slot;
        pa.Assemble(srcAttribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
        AR_EVENT(GSStats(gsContext.stats.numInstExecuted));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in the shader, it is stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx = SIMD16::setzero_si();
                                SIMD16::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx = SIMD::setzero_si();
                                SIMD::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param vertsPerPrim - number of vertices per input primitive
/// @param pGsBuffers - GS buffer allocations to fill in
template<typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    ScalarPatch patchData[KNOB_SIMD_WIDTH];
    void* pTxCtx;
    size_t tsCtxSize;

    simdscalar* pDSOutput;
    size_t dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData = (TessellationThreadLocalData*)
            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS buffer allocations, used when the GS stage is enabled
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    GsBuffers* pGsBuffers,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari const &primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.vertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats(hsContext.stats.numInstExecuted));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_END(FEDomainShader, 0);

            AR_EVENT(DSStats(dsContext.stats.numInstExecuted));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2, // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx = SIMD16::setzero_si();
                    SIMD16::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx = SIMD::setzero_si();
                    SIMD::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
THREAD uint32_t gVertexStoreSize = 0;

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    gfxptr_t xpLastRequestedIndex = 0;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
        xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    GsBuffers gsBuffers;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
#if USE_SIMD16_FRONTEND
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
#else
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
#endif

    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

    // Compute storage requirements for vertex store
    // TODO: allocation needs to be rethought for better cut support
    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
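
    // Sizing example (editor's note, illustrative only): for TOP_TRIANGLE_LIST,
    // vertexCount == 3, so the store holds 5 simd-wide vertices -- the extra
    // two give the PA state machine room to carry vertices across simd
    // boundaries.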
1604
1605 // grow the vertex store for the PA as necessary
1606 if (gVertexStoreSize < vertexStoreSize)
1607 {
1608 if (gpVertexStore != nullptr)
1609 {
1610 AlignedFree(gpVertexStore);
1611 gpVertexStore = nullptr;
1612 }
1613
1614 SWR_ASSERT(gpVertexStore == nullptr);
1615
1616 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
1617 gVertexStoreSize = vertexStoreSize;
1618
1619 SWR_ASSERT(gpVertexStore != nullptr);
1620 }
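// The vertex store is grow-only scratch: it is reallocated only when a draw
// needs more space than any previous draw seen by this storage.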
1621
1622 // Choose the primitive assembler implementation for this topology and index mode.
1623
1624 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
1625 PA_STATE& pa = paFactory.GetPA();
1626
1627 #if USE_SIMD16_FRONTEND
1628 #if USE_SIMD16_SHADERS
1629 simd16vertex vin;
1630 #else
1631 simdvertex vin_lo;
1632 simdvertex vin_hi;
1633 #endif
1634 SWR_VS_CONTEXT vsContext_lo;
1635 SWR_VS_CONTEXT vsContext_hi;
1636
1637 #if USE_SIMD16_SHADERS
1638 vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
1639 vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
1640 #else
1641 vsContext_lo.pVin = &vin_lo;
1642 vsContext_hi.pVin = &vin_hi;
1643 #endif
1644 vsContext_lo.AlternateOffset = 0;
1645 vsContext_hi.AlternateOffset = 1;
1646
1647 SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
1648
1649 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1650 fetchInfo_lo.StartInstance = work.startInstance;
1651 fetchInfo_lo.StartVertex = 0;
1652
1653 if (IsIndexedT::value)
1654 {
1655 fetchInfo_lo.BaseVertex = work.baseVertex;
1656
1657 // If the entire index buffer isn't consumed, set the last index so that
1658 // fetches of fewer than a full SIMD width of indices are masked off.
1659 fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1660 if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1661 {
1662 fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1663 }
1664 }
1665 else
1666 {
1667 fetchInfo_lo.StartVertex = work.startVertex;
1668 }
1669
1670 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1671
1672 const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
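// vScale holds per-lane offsets 0..15; adding it to startVertexID below
// synthesizes the 16 linear vertex indices consumed by one SIMD16 iteration
// of a non-indexed draw.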
1673
1674 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1675 {
1676 uint32_t i = 0;
1677
1678 simd16scalari vIndex;
1679
1680 if (IsIndexedT::value)
1681 {
1682 fetchInfo_lo.xpIndices = work.xpIB;
1683 fetchInfo_hi.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1684 }
1685 else
1686 {
1687 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1688
1689 fetchInfo_lo.xpIndices = (gfxptr_t)&vIndex;
1690 fetchInfo_hi.xpIndices = (gfxptr_t)&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH
1691 }
1692
1693 fetchInfo_lo.CurInstance = instanceNum;
1694 fetchInfo_hi.CurInstance = instanceNum;
1695
1696 vsContext_lo.InstanceID = instanceNum;
1697 vsContext_hi.InstanceID = instanceNum;
1698
1699 while (pa.HasWork())
1700 {
1701 // GetNextVsOutput currently has the side effect of updating some PA state
1702 // machine state, so it must stay outside of the (i < endVertex) check below.
1703
1704 simdmask *pvCutIndices_lo = nullptr;
1705 simdmask *pvCutIndices_hi = nullptr;
1706
1707 if (IsIndexedT::value)
1708 {
1709 // simd16mask <=> simdmask[2]
1710
1711 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
1712 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
1713 }
1714
1715 simd16vertex &vout = pa.GetNextVsOutput();
1716
1717 vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
1718 vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
1719
1720 if (i < endVertex)
1721 {
1722 if (!IsIndexedT::value)
1723 {
1724 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
1725 uint32_t offset = std::min(endVertex - i, (uint32_t) KNOB_SIMD16_WIDTH);
1727 #if USE_SIMD16_SHADERS
1728 offset *= 4; // convert an index count to a byte offset
1729 fetchInfo_lo.xpLastIndex += offset;
1730 #else
1731 fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH) * 4; // * 4 for converting index to address
1732 uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
1733 assert(offset <= (uint32_t) KNOB_SIMD16_WIDTH); // offset is unsigned, so the old 'offset >= 0' check was vacuous
1734 fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
1735 fetchInfo_hi.xpLastIndex += offset2 * 4; // * 4 for converting index to address
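// Note: offset2 wraps when offset < KNOB_SIMD_WIDTH, but in that case the
// hi-half fetch below is skipped because (i + KNOB_SIMD_WIDTH) < endVertex
// also fails, so the wrapped value is never consumed.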
1736 #endif
1737 }
1738 // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1739 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
1740 #if USE_SIMD16_SHADERS
1741 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
1742 #else
1743 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo);
1744
1745 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1746 {
1747 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
1748 }
1749 #endif
1750 RDTSC_END(FEFetchShader, 0);
1751
1752 // forward fetch-generated vertex IDs to the vertex shader
1753 #if USE_SIMD16_SHADERS
1754 #if USE_SIMD16_VS
1755 vsContext_lo.VertexID16 = _simd16_insert_si(
1756 vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1757 vsContext_lo.VertexID16 = _simd16_insert_si(
1758 vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1759 #else
1760 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1761 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1762 #endif
1763 #else
1764 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1765 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1766 #endif
1767
1768 // Setup active mask for vertex shader.
1769 #if USE_SIMD16_VS
1770 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1771 #else
1772 vsContext_lo.mask = GenerateMask(endVertex - i);
1773 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1774 #endif
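// GenerateMask{16}(n) enables only the low min(n, SIMD width) lanes, so a
// final partial SIMD past endVertex leaves its excess lanes inactive in the VS.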
1775
1776 // forward cut mask to the PA
1777 if (IsIndexedT::value)
1778 {
1779 #if USE_SIMD16_SHADERS
1780 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1781 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1782 #else
1783 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1784 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1785 #endif
1786 }
1787
1788 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1789
1790 #if KNOB_ENABLE_TOSS_POINTS
1791 if (!KNOB_TOSS_FETCH)
1792 #endif
1793 {
1794 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
1795 #if USE_SIMD16_VS
1796 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1797 AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
1798 #else
1799 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1800 AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
1801
1802 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1803 {
1804 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
1805 AR_EVENT(VSStats(vsContext_hi.stats.numInstExecuted));
1806 }
1807 #endif
1808 RDTSC_END(FEVertexShader, 0);
1809
1810 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1811 }
1812 }
1813
1814 // 2. Assemble primitives from the last two SIMDs of shaded vertices.
1815 do
1816 {
1817 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1818
1819 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
1820 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1821 RDTSC_END(FEPAAssemble, 1);
1822
1823 #if KNOB_ENABLE_TOSS_POINTS
1824 if (!KNOB_TOSS_FETCH)
1825 #endif
1826 {
1827 #if KNOB_ENABLE_TOSS_POINTS
1828 if (!KNOB_TOSS_VS)
1829 #endif
1830 {
1831 if (assemble)
1832 {
1833 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1834
1835 const uint32_t numPrims = pa.NumPrims();
1836 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1837 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
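// Split the up-to-16 assembled prims into 8-wide halves; numPrims_hi is
// zero whenever numPrims <= KNOB_SIMD_WIDTH.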
1838
1839 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1840 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1841 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1842
1843 if (HasTessellationT::value)
1844 {
1845 pa.useAlternateOffset = false;
1846 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1847
1848 if (numPrims_hi)
1849 {
1850 pa.useAlternateOffset = true;
1851 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1852 }
1853 }
1854 else if (HasGeometryShaderT::value)
1855 {
1856 pa.useAlternateOffset = false;
1857 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1858
1859 if (numPrims_hi)
1860 {
1861 pa.useAlternateOffset = true;
1862 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1863 }
1864 }
1865 else
1866 {
1867 // If streamout is enabled then stream vertices out to memory.
1868 if (HasStreamOutT::value)
1869 {
1870 pa.useAlternateOffset = false;
1871 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1872 }
1873
1874 if (HasRastT::value)
1875 {
1876 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
1877 // Gather data from the SGV slot if provided.
1878 simd16scalari vpai = SIMD16::setzero_si();
1879 simd16scalari rtai = SIMD16::setzero_si();
1880 SIMD16::Vec4 svgAttrib[4];
1881
1882 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1883 {
1884 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1885 }
1886
1887
1888 if (state.backendState.readViewportArrayIndex)
1889 {
1890 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1891 pa.viewportArrayActive = true;
1892 }
1893 if (state.backendState.readRenderTargetArrayIndex)
1894 {
1895 rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1896 pa.rtArrayActive = true;
1897 }
1898
1899 {
1900 // OOB VPAI indices => forced to zero.
1901 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
1902 simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1903 simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports);
1904 vpai = SIMD16::and_si(vClearMask, vpai);
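// cmplt yields all-ones lanes where vpai < KNOB_NUM_VIEWPORTS_SCISSORS;
// AND-ing with vpai therefore zeroes any out-of-range index (negative
// indices were already clamped by max_epi32 above).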
1905
1906 pa.useAlternateOffset = false;
1907 pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai);
1908 }
1909 }
1910 }
1911 }
1912 }
1913 }
1914 } while (pa.NextPrim());
1915
1916 if (IsIndexedT::value)
1917 {
1918 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
1919 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
1920 }
1921 else
1922 {
1923 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
1924 }
1925
1926 i += KNOB_SIMD16_WIDTH;
1927 }
1928
1929 pa.Reset();
1930 }
1931
1932 #else
1933 SWR_VS_CONTEXT vsContext;
1934 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1935
1936 fetchInfo.pStreams = &state.vertexBuffers[0];
1937 fetchInfo.StartInstance = work.startInstance;
1938 fetchInfo.StartVertex = 0;
1939
1940 if (IsIndexedT::value)
1941 {
1942 fetchInfo.BaseVertex = work.baseVertex;
1943
1944 // If the entire index buffer isn't consumed, set the last index so that
1945 // fetches of fewer than a full SIMD width of indices are masked off.
1946 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1947 if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
1948 {
1949 fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
1950 }
1951 }
1952 else
1953 {
1954 fetchInfo.StartVertex = work.startVertex;
1955 }
1956
1957 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1958
1959 /// @todo The instance loop temporarily lives in the FE to ensure stream-out ordering.
1960 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1961 {
1962 simdscalari vIndex;
1963 uint32_t i = 0;
1964
1965 if (IsIndexedT::value)
1966 {
1967 fetchInfo.pIndices = work.pIB;
1968 }
1969 else
1970 {
1971 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1972 fetchInfo.pIndices = (const int32_t*)&vIndex;
1973 }
1974
1975 fetchInfo.CurInstance = instanceNum;
1976 vsContext.InstanceID = instanceNum;
1977
1978 while (pa.HasWork())
1979 {
1980 // GetNextVsOutput currently has the side effect of updating some PA state
1981 // machine state, so it must stay outside of the (i < endVertex) check below.
1982 simdmask* pvCutIndices = nullptr;
1983 if (IsIndexedT::value)
1984 {
1985 pvCutIndices = &pa.GetNextVsIndices();
1986 }
1987
1988 simdvertex& vout = pa.GetNextVsOutput();
1989 vsContext.pVin = &vout;
1990 vsContext.pVout = &vout;
1991
1992 if (i < endVertex)
1993 {
1994
1995 // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1996 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
1997 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
1998 RDTSC_END(FEFetchShader, 0);
1999
2000 // forward fetch-generated vertex IDs to the vertex shader
2001 vsContext.VertexID = fetchInfo.VertexID;
2002
2003 // Setup active mask for vertex shader.
2004 vsContext.mask = GenerateMask(endVertex - i);
2005
2006 // forward cut mask to the PA
2007 if (IsIndexedT::value)
2008 {
2009 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2010 }
2011
2012 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2013
2014 #if KNOB_ENABLE_TOSS_POINTS
2015 if (!KNOB_TOSS_FETCH)
2016 #endif
2017 {
2018 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
2019 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
2020 RDTSC_END(FEVertexShader, 0);
2021
2022 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2023 AR_EVENT(VSStats(vsContext.stats.numInstExecuted));
2024 }
2025 }
2026
2027 // 2. Assemble primitives from the last two SIMDs of shaded vertices.
2028 do
2029 {
2030 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
2031 // pa.Assemble returns false if there are not enough verts to assemble.
2032 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
2033 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2034 RDTSC_END(FEPAAssemble, 1);
2035
2036 #if KNOB_ENABLE_TOSS_POINTS
2037 if (!KNOB_TOSS_FETCH)
2038 #endif
2039 {
2040 #if KNOB_ENABLE_TOSS_POINTS
2041 if (!KNOB_TOSS_VS)
2042 #endif
2043 {
2044 if (assemble)
2045 {
2046 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2047
2048 if (HasTessellationT::value)
2049 {
2050 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2051 pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
2052 }
2053 else if (HasGeometryShaderT::value)
2054 {
2055 GeometryShaderStage<HasStreamOutT, HasRastT>(
2056 pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
2057 }
2058 else
2059 {
2060 // If streamout is enabled then stream vertices out to memory.
2061 if (HasStreamOutT::value)
2062 {
2063 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2064 }
2065
2066 if (HasRastT::value)
2067 {
2068 SWR_ASSERT(pDC->pState->pfnProcessPrims);
2069
2070 // Gather data from the SGV slot if provided.
2071 simdscalari vViewportIdx = SIMD::setzero_si();
2072 simdscalari vRtIdx = SIMD::setzero_si();
2073 SIMD::Vec4 svgAttrib[4];
2074
2075 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
2076 {
2077 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2078 }
2079
2080 if (state.backendState.readViewportArrayIndex)
2081 {
2082 vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2083
2084 // OOB VPAI indices => forced to zero.
2085 vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2086 simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2087 simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2088 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2089 pa.viewportArrayActive = true;
2090 }
2091 if (state.backendState.readRenderTargetArrayIndex)
2092 {
2093 vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2094 pa.rtArrayActive = true;
2095 }
2096
2097 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
2098 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx);
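// GenMask(pa.NumPrims()) sets one bit per valid assembled primitive so the
// binner ignores unused SIMD lanes on a final, partially filled pass.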
2099 }
2100 }
2101 }
2102 }
2103 }
2104 } while (pa.NextPrim());
2105
2106 if (IsIndexedT::value)
2107 {
2108 fetchInfo.pIndices = (const int32_t*)((const uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2109 }
2110 else
2111 {
2112 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2113 }
2114
2115 i += KNOB_SIMD_WIDTH;
2116 }
2117 pa.Reset();
2118 }
2119
2120 #endif
2121
2122 RDTSC_END(FEProcessDraw, numPrims * work.numInstances);
2123 }
2124
2125 struct FEDrawChooser
2126 {
2127 typedef PFN_FE_WORK_FUNC FuncType;
2128
2129 template <typename... ArgsB>
2130 static FuncType GetFunc()
2131 {
2132 return ProcessDraw<ArgsB...>;
2133 }
2134 };
2135
2136
2137 // Selector for correct templated Draw front-end function
2138 PFN_FE_WORK_FUNC GetProcessDrawFunc(
2139 bool IsIndexed,
2140 bool IsCutIndexEnabled,
2141 bool HasTessellation,
2142 bool HasGeometryShader,
2143 bool HasStreamOut,
2144 bool HasRasterization)
2145 {
2146 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
2147 }
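// TemplateArgUnroller effectively converts each runtime bool into a
// compile-time integral_constant template argument, returning the fully
// specialized ProcessDraw instantiation for this draw's pipeline state.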