5d549873f363e88c43061f08003a75bc3a38276b
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        raster state.
/// Indexed by SWR_PIXEL_LOCATION: CENTER applies no offset, UL applies a
/// half-pixel offset per lane.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrDestroyContext.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to sync callback.
89 void ProcessShutdown(
90 SWR_CONTEXT *pContext,
91 DRAW_CONTEXT *pDC,
92 uint32_t workerId,
93 void *pUserData)
94 {
95 BE_WORK work;
96 work.type = SHUTDOWN;
97 work.pfnWork = ProcessShutdownBE;
98
99 MacroTileMgr *pTileMgr = pDC->pTileMgr;
100 // Enqueue at least 1 work item for each worker thread
101 // account for number of numa nodes
102 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
103
104 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
105 {
106 for (uint32_t n = 0; n < numNumaNodes; ++n)
107 {
108 pTileMgr->enqueue(i, n, &work);
109 }
110 }
111 }
112
113 //////////////////////////////////////////////////////////////////////////
114 /// @brief FE handler for SwrClearRenderTarget.
115 /// @param pContext - pointer to SWR context.
116 /// @param pDC - pointer to draw context.
117 /// @param workerId - thread's worker id. Even thread has a unique id.
118 /// @param pUserData - Pointer to user data passed back to clear callback.
119 /// @todo This should go away when we switch this to use compute threading.
120 void ProcessClear(
121 SWR_CONTEXT *pContext,
122 DRAW_CONTEXT *pDC,
123 uint32_t workerId,
124 void *pUserData)
125 {
126 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
127 MacroTileMgr *pTileMgr = pDC->pTileMgr;
128
129 // queue a clear to each macro tile
130 // compute macro tile bounds for the specified rect
131 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
132 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
133 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
134 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
135
136 BE_WORK work;
137 work.type = CLEAR;
138 work.pfnWork = ProcessClearBE;
139 work.desc.clear = *pDesc;
140
141 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
142 {
143 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
144 {
145 pTileMgr->enqueue(x, y, &work);
146 }
147 }
148 }
149
150 //////////////////////////////////////////////////////////////////////////
151 /// @brief FE handler for SwrStoreTiles.
152 /// @param pContext - pointer to SWR context.
153 /// @param pDC - pointer to draw context.
154 /// @param workerId - thread's worker id. Even thread has a unique id.
155 /// @param pUserData - Pointer to user data passed back to callback.
156 /// @todo This should go away when we switch this to use compute threading.
157 void ProcessStoreTiles(
158 SWR_CONTEXT *pContext,
159 DRAW_CONTEXT *pDC,
160 uint32_t workerId,
161 void *pUserData)
162 {
163 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
164 MacroTileMgr *pTileMgr = pDC->pTileMgr;
165 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
166
167 // queue a store to each macro tile
168 // compute macro tile bounds for the specified rect
169 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
170 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
171 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
172 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
173
174 // store tiles
175 BE_WORK work;
176 work.type = STORETILES;
177 work.pfnWork = ProcessStoreTileBE;
178 work.desc.storeTiles = *pDesc;
179
180 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
181 {
182 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
183 {
184 pTileMgr->enqueue(x, y, &work);
185 }
186 }
187
188 AR_END(FEProcessStoreTiles, 0);
189 }
190
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default path: only macrotiles *fully* covered by the rect (round the
    // mins up, the maxes down).
    // NOTE(review): these are unsigned; if rect.xmax < KNOB_MACROTILE_X_DIM
    // the "- 1" below wraps to a huge value — presumably callers pass
    // tile-aligned, non-degenerate rects when fullTilesOnly is set; confirm.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // NOTE(review): clamps to KNOB_NUM_HOT_TILES_X/Y rather than (KNOB - 1),
    // matching the inclusive asserts above — verify against MacroTileMgr
    // whether index == KNOB_NUM_HOT_TILES_X is actually addressable.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
245
246 //////////////////////////////////////////////////////////////////////////
247 /// @brief Computes the number of primitives given the number of verts.
248 /// @param mode - primitive topology for draw operation.
249 /// @param numPrims - number of vertices or indices for draw.
250 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
251 uint32_t GetNumPrims(
252 PRIMITIVE_TOPOLOGY mode,
253 uint32_t numPrims)
254 {
255 switch (mode)
256 {
257 case TOP_POINT_LIST: return numPrims;
258 case TOP_TRIANGLE_LIST: return numPrims / 3;
259 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
260 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
261 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
262 case TOP_QUAD_LIST: return numPrims / 4;
263 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
264 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
265 case TOP_LINE_LIST: return numPrims / 2;
266 case TOP_LINE_LOOP: return numPrims;
267 case TOP_RECT_LIST: return numPrims / 3;
268 case TOP_LINE_LIST_ADJ: return numPrims / 4;
269 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
270 case TOP_TRI_LIST_ADJ: return numPrims / 6;
271 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
272
273 case TOP_PATCHLIST_1:
274 case TOP_PATCHLIST_2:
275 case TOP_PATCHLIST_3:
276 case TOP_PATCHLIST_4:
277 case TOP_PATCHLIST_5:
278 case TOP_PATCHLIST_6:
279 case TOP_PATCHLIST_7:
280 case TOP_PATCHLIST_8:
281 case TOP_PATCHLIST_9:
282 case TOP_PATCHLIST_10:
283 case TOP_PATCHLIST_11:
284 case TOP_PATCHLIST_12:
285 case TOP_PATCHLIST_13:
286 case TOP_PATCHLIST_14:
287 case TOP_PATCHLIST_15:
288 case TOP_PATCHLIST_16:
289 case TOP_PATCHLIST_17:
290 case TOP_PATCHLIST_18:
291 case TOP_PATCHLIST_19:
292 case TOP_PATCHLIST_20:
293 case TOP_PATCHLIST_21:
294 case TOP_PATCHLIST_22:
295 case TOP_PATCHLIST_23:
296 case TOP_PATCHLIST_24:
297 case TOP_PATCHLIST_25:
298 case TOP_PATCHLIST_26:
299 case TOP_PATCHLIST_27:
300 case TOP_PATCHLIST_28:
301 case TOP_PATCHLIST_29:
302 case TOP_PATCHLIST_30:
303 case TOP_PATCHLIST_31:
304 case TOP_PATCHLIST_32:
305 return numPrims / (mode - TOP_PATCHLIST_BASE);
306
307 case TOP_POLYGON:
308 case TOP_POINT_LIST_BF:
309 case TOP_LINE_STRIP_CONT:
310 case TOP_LINE_STRIP_BF:
311 case TOP_LINE_STRIP_CONT_BF:
312 case TOP_TRIANGLE_FAN_NOSTIPPLE:
313 case TOP_TRI_STRIP_REVERSE:
314 case TOP_PATCHLIST_BASE:
315 case TOP_UNKNOWN:
316 SWR_ASSERT(false, "Unsupported topology: %d", mode);
317 return 0;
318 }
319
320 return 0;
321 }
322
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Computes the number of verts given the number of primitives.
325 /// @param mode - primitive topology for draw operation.
326 /// @param numPrims - number of primitives for draw.
327 uint32_t GetNumVerts(
328 PRIMITIVE_TOPOLOGY mode,
329 uint32_t numPrims)
330 {
331 switch (mode)
332 {
333 case TOP_POINT_LIST: return numPrims;
334 case TOP_TRIANGLE_LIST: return numPrims * 3;
335 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
336 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
337 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
338 case TOP_QUAD_LIST: return numPrims * 4;
339 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
340 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
341 case TOP_LINE_LIST: return numPrims * 2;
342 case TOP_LINE_LOOP: return numPrims;
343 case TOP_RECT_LIST: return numPrims * 3;
344 case TOP_LINE_LIST_ADJ: return numPrims * 4;
345 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
346 case TOP_TRI_LIST_ADJ: return numPrims * 6;
347 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
348
349 case TOP_PATCHLIST_1:
350 case TOP_PATCHLIST_2:
351 case TOP_PATCHLIST_3:
352 case TOP_PATCHLIST_4:
353 case TOP_PATCHLIST_5:
354 case TOP_PATCHLIST_6:
355 case TOP_PATCHLIST_7:
356 case TOP_PATCHLIST_8:
357 case TOP_PATCHLIST_9:
358 case TOP_PATCHLIST_10:
359 case TOP_PATCHLIST_11:
360 case TOP_PATCHLIST_12:
361 case TOP_PATCHLIST_13:
362 case TOP_PATCHLIST_14:
363 case TOP_PATCHLIST_15:
364 case TOP_PATCHLIST_16:
365 case TOP_PATCHLIST_17:
366 case TOP_PATCHLIST_18:
367 case TOP_PATCHLIST_19:
368 case TOP_PATCHLIST_20:
369 case TOP_PATCHLIST_21:
370 case TOP_PATCHLIST_22:
371 case TOP_PATCHLIST_23:
372 case TOP_PATCHLIST_24:
373 case TOP_PATCHLIST_25:
374 case TOP_PATCHLIST_26:
375 case TOP_PATCHLIST_27:
376 case TOP_PATCHLIST_28:
377 case TOP_PATCHLIST_29:
378 case TOP_PATCHLIST_30:
379 case TOP_PATCHLIST_31:
380 case TOP_PATCHLIST_32:
381 return numPrims * (mode - TOP_PATCHLIST_BASE);
382
383 case TOP_POLYGON:
384 case TOP_POINT_LIST_BF:
385 case TOP_LINE_STRIP_CONT:
386 case TOP_LINE_STRIP_BF:
387 case TOP_LINE_STRIP_CONT_BF:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE:
389 case TOP_TRI_STRIP_REVERSE:
390 case TOP_PATCHLIST_BASE:
391 case TOP_UNKNOWN:
392 SWR_ASSERT(false, "Unsupported topology: %d", mode);
393 return 0;
394 }
395
396 return 0;
397 }
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
404 {
405 uint32_t numVerts = 0;
406 switch (topology)
407 {
408 case TOP_POINT_LIST:
409 case TOP_POINT_LIST_BF:
410 numVerts = 1;
411 break;
412 case TOP_LINE_LIST:
413 case TOP_LINE_STRIP:
414 case TOP_LINE_LIST_ADJ:
415 case TOP_LINE_LOOP:
416 case TOP_LINE_STRIP_CONT:
417 case TOP_LINE_STRIP_BF:
418 case TOP_LISTSTRIP_ADJ:
419 numVerts = 2;
420 break;
421 case TOP_TRIANGLE_LIST:
422 case TOP_TRIANGLE_STRIP:
423 case TOP_TRIANGLE_FAN:
424 case TOP_TRI_LIST_ADJ:
425 case TOP_TRI_STRIP_ADJ:
426 case TOP_TRI_STRIP_REVERSE:
427 case TOP_RECT_LIST:
428 numVerts = 3;
429 break;
430 case TOP_QUAD_LIST:
431 case TOP_QUAD_STRIP:
432 numVerts = 4;
433 break;
434 case TOP_PATCHLIST_1:
435 case TOP_PATCHLIST_2:
436 case TOP_PATCHLIST_3:
437 case TOP_PATCHLIST_4:
438 case TOP_PATCHLIST_5:
439 case TOP_PATCHLIST_6:
440 case TOP_PATCHLIST_7:
441 case TOP_PATCHLIST_8:
442 case TOP_PATCHLIST_9:
443 case TOP_PATCHLIST_10:
444 case TOP_PATCHLIST_11:
445 case TOP_PATCHLIST_12:
446 case TOP_PATCHLIST_13:
447 case TOP_PATCHLIST_14:
448 case TOP_PATCHLIST_15:
449 case TOP_PATCHLIST_16:
450 case TOP_PATCHLIST_17:
451 case TOP_PATCHLIST_18:
452 case TOP_PATCHLIST_19:
453 case TOP_PATCHLIST_20:
454 case TOP_PATCHLIST_21:
455 case TOP_PATCHLIST_22:
456 case TOP_PATCHLIST_23:
457 case TOP_PATCHLIST_24:
458 case TOP_PATCHLIST_25:
459 case TOP_PATCHLIST_26:
460 case TOP_PATCHLIST_27:
461 case TOP_PATCHLIST_28:
462 case TOP_PATCHLIST_29:
463 case TOP_PATCHLIST_30:
464 case TOP_PATCHLIST_31:
465 case TOP_PATCHLIST_32:
466 numVerts = topology - TOP_PATCHLIST_BASE;
467 break;
468 default:
469 SWR_ASSERT(false, "Unsupported topology: %d", topology);
470 break;
471 }
472
473 if (includeAdjVerts)
474 {
475 switch (topology)
476 {
477 case TOP_LISTSTRIP_ADJ:
478 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
479 case TOP_TRI_STRIP_ADJ:
480 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
481 default: break;
482 }
483 }
484
485 return numVerts;
486 }
487
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Generate mask from remaining work.
490 /// @param numWorkItems - Number of items being worked on by a SIMD.
491 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
492 {
493 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
494 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
495 return _simd_castps_si(vMask(mask));
496 }
497
498
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
// Primary template is intentionally unusable: only the explicit
// specializations below are valid; any other SIMD width trips the assert.
template<size_t SimdWidth>
struct GatherScissors
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};
519
// 8-wide specialization: for each SIMD lane, fetch the edge of the scissor
// rect selected by that lane's viewport index. Note _simd_set_epi32 takes
// arguments highest-lane-first, so index [0] lands in lane 7's slot.
template<>
struct GatherScissors<8>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    }
};
561
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object supplying the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer that assembled attributes are staged in
///        before the streamout jit function consumes them.
/// @param streamIndex - which SO stream to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // soMask has one bit per enabled SO attribute slot for this stream.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is the dword offset of this attrib from the
            // start of a vertex within pPrimData.
            // NOTE(review): an earlier comment here claimed a (slot - 1)
            // adjustment, but the code offsets by 'slot' directly — the stale
            // text was removed. Confirm against the SOS jit's expected layout.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
652
653 //////////////////////////////////////////////////////////////////////////
654 /// @brief Computes number of invocations. The current index represents
655 /// the start of the SIMD. The max index represents how much work
656 /// items are remaining. If there is less then a SIMD's xmin of work
657 /// then return the remaining amount of work.
658 /// @param curIndex - The start index for the SIMD.
659 /// @param maxIndex - The last index for all work items.
660 static INLINE uint32_t GetNumInvocations(
661 uint32_t curIndex,
662 uint32_t maxIndex)
663 {
664 uint32_t remainder = (maxIndex - curIndex);
665 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
666 }
667
668 //////////////////////////////////////////////////////////////////////////
669 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
670 /// The geometry shader will loop over each active streamout buffer, assembling
671 /// primitives for the downstream stages. When multistream output is enabled,
672 /// the generated stream ID buffer from the GS needs to be converted to a cut
673 /// buffer for the primitive assembler.
674 /// @param stream - stream id to generate the cut buffer for
675 /// @param pStreamIdBase - pointer to the stream ID buffer
676 /// @param numEmittedVerts - Number of total verts emitted by the GS
677 /// @param pCutBuffer - output buffer to write cuts to
678 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
679 {
680 SWR_ASSERT(stream < MAX_SO_STREAMS);
681
682 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
683 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
684
685 for (uint32_t b = 0; b < numOutputBytes; ++b)
686 {
687 uint8_t curInputByte = pStreamIdBase[2*b];
688 uint8_t outByte = 0;
689 for (uint32_t i = 0; i < 4; ++i)
690 {
691 if ((curInputByte & 0x3) != stream)
692 {
693 outByte |= (1 << i);
694 }
695 curInputByte >>= 2;
696 }
697
698 curInputByte = pStreamIdBase[2 * b + 1];
699 for (uint32_t i = 0; i < 4; ++i)
700 {
701 if ((curInputByte & 0x3) != stream)
702 {
703 outByte |= (1 << (i + 4));
704 }
705 curInputByte >>= 2;
706 }
707
708 *pCutBuffer++ = outByte;
709 }
710 }
711
// Per-worker-thread geometry shader context (THREAD is presumably the
// project's thread-local-storage qualifier — confirm in its definition).
THREAD SWR_GS_CONTEXT tlsGsContext;
713
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (or stream-id) buffer written by the GS.
/// @param pStreamCutBuffer - scratch cut buffer used when translating a
///        multi-stream stream-id buffer for one stream.
/// @param pSoPrimData - scratch buffer for streamout attribute staging.
/// @param primID - SIMD vector of input primitive IDs.
/// HasStreamOutT / HasRastT are compile-time flags enabling the streamout
/// and rasterization paths respectively.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Per-prim/per-instance strides into the GS output and cut buffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per emitted vert
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi-stream: 2 stream-id bits per emitted vert, dword aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Run the GS once per instance; each invocation appends to the next
    // instance-sized region of the output/cut buffers.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // Reinterpret the SIMD registers as per-lane uint32 arrays.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: this local intentionally shadows the function
                // parameter 'pCutBuffer' for the remainder of the loop body.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // otherwise, replicate the input prim's id
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    AR_END(FEGeometryShader, 1);
}
934
935 //////////////////////////////////////////////////////////////////////////
936 /// @brief Allocate GS buffers
937 /// @param pDC - pointer to draw context.
938 /// @param state - API state
939 /// @param ppGsOut - pointer to GS output buffer allocation
940 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
941 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
942 void **ppStreamCutBuffer)
943 {
944 auto pArena = pDC->pArena;
945 SWR_ASSERT(pArena != nullptr);
946 SWR_ASSERT(state.gsState.gsEnable);
947 // allocate arena space to hold GS output verts
948 // @todo pack attribs
949 // @todo support multiple streams
950 const uint32_t vertexStride = sizeof(simdvertex);
951 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
952 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
953 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
954
955 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
956 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
957 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
958 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
959
960 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
961 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
962
963 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
964 if (state.gsState.isSingleStream)
965 {
966 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
967 *ppStreamCutBuffer = nullptr;
968 }
969 else
970 {
971 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
972 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
973 }
974
975 }
976
977 //////////////////////////////////////////////////////////////////////////
978 /// @brief Contains all data generated by the HS and passed to the
979 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // HS input/output context, reused across patches
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output, one patch per SIMD lane
    void* pTxCtx;                           // tessellator context storage (lazily allocated)
    size_t tsCtxSize;                       // size of the pTxCtx allocation, in bytes

    simdscalar* pDSOutput;                  // DS output buffer, grown on demand
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in simd vectors
};
990
// Per-thread tessellation scratch storage; allocated lazily by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
992
993 //////////////////////////////////////////////////////////////////////////
994 /// @brief Allocate tessellation data for this worker thread.
995 INLINE
996 static void AllocateTessellationData(SWR_CONTEXT* pContext)
997 {
998 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
999 if (gt_pTessellationThreadData == nullptr)
1000 {
1001 gt_pTessellationThreadData = (TessellationThreadLocalData*)
1002 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1003 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1004 }
1005 }
1006
1007 //////////////////////////////////////////////////////////////////////////
1008 /// @brief Implements Tessellation Stages.
1009 /// @param pDC - pointer to draw context.
1010 /// @param workerId - thread's worker id. Even thread has a unique id.
1011 /// @param pa - The primitive assembly object.
1012 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator context from thread-local storage.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // Init failed; presumably TSInitCtx reported the required storage size
        // through tsCtxSize (TODO confirm) — allocate it and retry once.
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Select the clipper/binner matching the topology the DS produces.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // debug-only: fill HS output with a recognizable pattern so stale reads stand out
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (one per SIMD lane) individually.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            // tessellator produced nothing for this patch
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory, growing the thread-local buffer on demand.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // debug-only: fill DS output with a recognizable pattern so stale reads stand out
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        // Invoke the DS one SIMD-width of domain points at a time; the last
        // iteration's mask covers only the remaining points.
        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Assemble tessellated primitives from the DS output and feed them
        // downstream (GS, streamout, and/or the clipper).
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                // tessellated prims feed the GS stage, which handles SO/rast itself
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1201
1202 //////////////////////////////////////////////////////////////////////////
1203 /// @brief FE handler for SwrDraw.
1204 /// @tparam IsIndexedT - Is indexed drawing enabled
1205 /// @tparam HasTessellationT - Is tessellation enabled
1206 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1207 /// @tparam HasStreamOutT - Is stream-out enabled
1208 /// @tparam HasRastT - Is rasterization enabled
1209 /// @param pContext - pointer to SWR context.
1210 /// @param pDC - pointer to draw context.
1211 /// @param workerId - thread's worker id.
1212 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // lane offsets 0..7, used to synthesize vertex indices for non-indexed draws
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last index we will
    // consume, so partial-SIMD fetches can be masked against it below.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // GS scratch buffers (arena-backed, only when the GS stage is active)
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;     // vertex cursor into this instance's vertex stream

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // non-indexed: synthesize sequential indices starting at startVertexID
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Dispatch to the first active downstream stage:
                            // tessellation, else GS, else streamout/rasterizer directly.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance the fetch cursor by one SIMD of vertices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    // NOTE(review): numPrims is only declared under KNOB_ENABLE_RDTSC above;
    // presumably AR_END discards its argument otherwise — confirm.
    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1468
// Functor for TemplateArgUnroller: maps a pack of boolean template arguments
// to the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        // instantiate ProcessDraw with the chosen template arguments
        return ProcessDraw<ArgsB...>;
    }
};
1479
1480
1481 // Selector for correct templated Draw front-end function
// Maps the six runtime draw flags onto the matching ProcessDraw template
// instantiation via TemplateArgUnroller / FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1492
1493 //////////////////////////////////////////////////////////////////////////
1494 /// @brief Processes attributes for the backend based on linkage mask and
1495 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1496 /// @param pDC - Draw context
1497 /// @param pa - Primitive Assembly state
1498 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1499 /// @param pLinkageMap - maps VS attribute slot to PS slot
1500 /// @param triIndex - Triangle to process attributes for
1501 /// @param pBuffer - Output result
template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
INLINE void ProcessAttributes(
    DRAW_CONTEXT *pDC,
    PA_STATE&pa,
    uint32_t triIndex,
    uint32_t primId,
    float *pBuffer)
{
    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;

    // constant values indexed by SWR_CONSTANT_SOURCE (0000 / 0001 / 1111)
    static const float constTable[3][4] = {
        {0.0f, 0.0f, 0.0f, 0.0f},
        {0.0f, 0.0f, 0.0f, 1.0f},
        {1.0f, 1.0f, 1.0f, 1.0f}
    };

    // one iteration per backend attribute; writes NumVertsT::value verts
    // (padded to 3) of 4 floats each into pBuffer
    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
    {
        uint32_t inputSlot;
        if (IsSwizzledT::value)
        {
            // remap through the swizzle table to the source VS attribute
            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
            inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;

        }
        else
        {
            inputSlot = VERTEX_ATTRIB_START_SLOT + i;
        }

        __m128 attrib[3];    // triangle attribs (always 4 wide)
        float* pAttribStart = pBuffer;  // remembered for component overrides below

        if (HasConstantInterpT::value || IsDegenerate::value)
        {
            if (_bittest(&constantInterpMask, i))
            {
                // Constant interpolation: replicate the provoking vertex's
                // attribute to all verts. The tables translate (triIndex,
                // provokingVertex) to the triangle/vertex that holds the
                // provoking vertex for quad/strip decompositions.
                uint32_t vid;
                uint32_t adjustedTriIndex;
                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
                static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
                static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
                static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
                static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };

                switch (topo) {
                case TOP_QUAD_LIST:
                    adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
                    vid = quadProvokingVertex[triIndex & 1][provokingVertex];
                    break;
                case TOP_QUAD_STRIP:
                    adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
                    vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
                    break;
                case TOP_TRIANGLE_STRIP:
                    adjustedTriIndex = triIndex;
                    vid = (triIndex & 1)
                        ? tristripProvokingVertex[provokingVertex]
                        : provokingVertex;
                    break;
                default:
                    adjustedTriIndex = triIndex;
                    vid = provokingVertex;
                    break;
                }

                pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);

                // NOTE(review): this inner `i` shadows the attribute-loop `i`
                // (same in the loops below); behavior is correct but a rename
                // would aid readability.
                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                {
                    _mm_store_ps(pBuffer, attrib[vid]);
                    pBuffer += 4;
                }
            }
            else
            {
                // linear interpolation: copy each vertex's attribute
                pa.AssembleSingle(inputSlot, triIndex, attrib);

                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                {
                    _mm_store_ps(pBuffer, attrib[i]);
                    pBuffer += 4;
                }
            }
        }
        else
        {
            // no constant interpolation possible: copy each vertex's attribute
            pa.AssembleSingle(inputSlot, triIndex, attrib);

            for (uint32_t i = 0; i < NumVertsT::value; ++i)
            {
                _mm_store_ps(pBuffer, attrib[i]);
                pBuffer += 4;
            }
        }

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t v = NumVertsT::value; v < 3; ++v)
        {
            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
            pBuffer += 4;
        }

        // check for constant source overrides
        if (IsSwizzledT::value)
        {
            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
            if (mask)
            {
                DWORD comp;
                while (_BitScanForward(&comp, mask))
                {
                    mask &= ~(1 << comp);

                    float constantValue = 0.0f;
                    switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
                    {
                    case SWR_CONSTANT_SOURCE_CONST_0000:
                    case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
                    case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
                        constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
                        break;
                    case SWR_CONSTANT_SOURCE_PRIM_ID:
                        // reinterpret the primitive ID bits as a float
                        constantValue = *(float*)&primId;
                        break;
                    }

                    // apply constant value to all 3 vertices
                    for (uint32_t v = 0; v < 3; ++v)
                    {
                        pAttribStart[comp + v * 4] = constantValue;
                    }
                }
            }
        }
    }
}
1647
1648
// Signature shared by every ProcessAttributes instantiation.
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
1650
// Functor for TemplateArgUnroller: maps template arguments to the matching
// ProcessAttributes instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        // instantiate ProcessAttributes with the chosen template arguments
        return ProcessAttributes<ArgsB...>;
    }
};
1661
// Selects the ProcessAttributes instantiation matching the runtime flags.
// NumVerts must be in [1, 3] (enforced by IntArg<1, 3>).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1666
1667 //////////////////////////////////////////////////////////////////////////
1668 /// @brief Processes enabled user clip distances. Loads the active clip
1669 /// distances from the PA, sets up barycentric equations, and
1670 /// stores the results to the output buffer
1671 /// @param pa - Primitive Assembly state
1672 /// @param primIndex - primitive index to process
1673 /// @param clipDistMask - mask of enabled clip distances
1674 /// @param pUserClipBuffer - buffer to store results
1675 template<uint32_t NumVerts>
1676 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1677 {
1678 DWORD clipDist;
1679 while (_BitScanForward(&clipDist, clipDistMask))
1680 {
1681 clipDistMask &= ~(1 << clipDist);
1682 uint32_t clipSlot = clipDist >> 2;
1683 uint32_t clipComp = clipDist & 0x3;
1684 uint32_t clipAttribSlot = clipSlot == 0 ?
1685 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1686
1687 __m128 primClipDist[3];
1688 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1689
1690 float vertClipDist[NumVerts];
1691 for (uint32_t e = 0; e < NumVerts; ++e)
1692 {
1693 OSALIGNSIMD(float) aVertClipDist[4];
1694 _mm_store_ps(aVertClipDist, primClipDist[e]);
1695 vertClipDist[e] = aVertClipDist[clipComp];
1696 };
1697
1698 // setup plane equations for barycentric interpolation in the backend
1699 float baryCoeff[NumVerts];
1700 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1701 {
1702 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1703 }
1704 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1705
1706 for (uint32_t e = 0; e < NumVerts; ++e)
1707 {
1708 *(pUserClipBuffer++) = baryCoeff[e];
1709 }
1710 }
1711 }
1712
1713 //////////////////////////////////////////////////////////////////////////
1714 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1715 /// Point precision from FP32.
1716 template <typename PT = FixedPointTraits<Fixed_16_8>>
1717 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1718 {
1719 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1720 return _simd_cvtps_epi32(vFixed);
1721 }
1722
1723 //////////////////////////////////////////////////////////////////////////
1724 /// @brief Helper function to set the X,Y coords of a triangle to the
1725 /// requested Fixed Point precision from FP32.
1726 /// @param tri: simdvector[3] of FP triangle verts
1727 /// @param vXi: fixed point X coords of tri verts
1728 /// @param vYi: fixed point Y coords of tri verts
1729 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1730 {
1731 vXi[0] = fpToFixedPointVertical(tri[0].x);
1732 vYi[0] = fpToFixedPointVertical(tri[0].y);
1733 vXi[1] = fpToFixedPointVertical(tri[1].x);
1734 vYi[1] = fpToFixedPointVertical(tri[1].y);
1735 vXi[2] = fpToFixedPointVertical(tri[2].x);
1736 vYi[2] = fpToFixedPointVertical(tri[2].y);
1737 }
1738
1739 //////////////////////////////////////////////////////////////////////////
1740 /// @brief Calculate bounding box for current triangle
1741 /// @tparam CT: ConservativeRastFETraits type
1742 /// @param vX: fixed point X position for triangle verts
1743 /// @param vY: fixed point Y position for triangle verts
1744 /// @param bbox: fixed point bbox
1745 /// *Note*: expects vX, vY to be in the correct precision for the type
1746 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1747 template <typename CT>
1748 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1749 {
1750 simdscalari vMinX = vX[0];
1751 vMinX = _simd_min_epi32(vMinX, vX[1]);
1752 vMinX = _simd_min_epi32(vMinX, vX[2]);
1753
1754 simdscalari vMaxX = vX[0];
1755 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1756 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1757
1758 simdscalari vMinY = vY[0];
1759 vMinY = _simd_min_epi32(vMinY, vY[1]);
1760 vMinY = _simd_min_epi32(vMinY, vY[2]);
1761
1762 simdscalari vMaxY = vY[0];
1763 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1764 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1765
1766 bbox.xmin = vMinX;
1767 bbox.xmax = vMaxX;
1768 bbox.ymin = vMinY;
1769 bbox.ymax = vMaxY;
1770 }
1771
1772 //////////////////////////////////////////////////////////////////////////
1773 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1774 /// Offsets BBox for conservative rast
1775 template <>
1776 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1777 {
1778 // FE conservative rast traits
1779 typedef FEConservativeRastT CT;
1780
1781 simdscalari vMinX = vX[0];
1782 vMinX = _simd_min_epi32(vMinX, vX[1]);
1783 vMinX = _simd_min_epi32(vMinX, vX[2]);
1784
1785 simdscalari vMaxX = vX[0];
1786 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1787 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1788
1789 simdscalari vMinY = vY[0];
1790 vMinY = _simd_min_epi32(vMinY, vY[1]);
1791 vMinY = _simd_min_epi32(vMinY, vY[2]);
1792
1793 simdscalari vMaxY = vY[0];
1794 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1795 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1796
1797 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1798 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1799 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1800 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1801 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1802 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1803 }
1804
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param triMask - Bitmask of lanes in 'tri' that hold valid triangles.
/// @param primID - Primitive ID for each triangle.
/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;


    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide: scale x/y/z of each vertex by 1/w
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup: edge coefficients A/B from the fixed point verts
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (2x triangle area); 64-bit per lane, split into lo/hi SIMD halves
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area: build a per-triangle bitmask of lanes whose determinant == 0.
    // Each 64-bit lane contributes one bit via movemask_pd; lo half fills the low
    // KNOB_SIMD_WIDTH/2 bits, hi half the upper bits.
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if(!CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull by winding, per the API cull mode
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    uint32_t edgeEnable;            // conservative rast only: 3 valid-edge bits per triangle
    PFN_WORK_FUNC pfnWork;          // non-conservative only: shared rasterizer function for all tris
    if(CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if(cullZeroAreaMask > 0)
        {
            // An edge degenerates when its two endpoints coincide in fixed point.
            // e0 = v1-v0
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // Scatter each per-triangle edge bit into a 3-bits-per-triangle layout
            // (pdep deposits the 8 mask bits into every 3rd position).
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // invert: a set bit now means "edge is valid to rasterize"
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
                                    (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            // Round bbox edges to the nearest pixel center (16.8 fixed point,
            // 256 units per pixel); if min and max round to the same center the
            // triangle covers no sample and can be discarded.
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if(CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if(CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                        (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                                        (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data (x, y, z, 1/w blocks of 4 floats each)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the triangle into every macrotile its bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    AR_END(FEBinTriangles, 1);
}
2210
//////////////////////////////////////////////////////////////////////////
/// @brief Chooser used with TemplateArgUnroller to map runtime arguments
///        onto the matching BinTriangles template instantiation.
struct FEBinTrianglesChooser
{
    typedef PFN_PROCESS_PRIMS FuncType;

    /// Returns the BinTriangles specialization whose ConservativeRastFETraits
    /// are built from the unrolled template arguments.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
    }
};
2221
//////////////////////////////////////////////////////////////////////////
/// @brief Selector for the correctly templated BinTriangles function.
/// @param IsConservative - true selects the conservative-rasterization
///        specialization; false selects the standard path.
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}
2227
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Fast "simple point" path handles
///        single-pixel points; otherwise points are bloated by point size
///        and binned to every macrotile they touch.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points
///        (only prim[0] is used).
/// @param primMask - Bitmask of lanes in 'prim' that hold valid points.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport array index for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinPoints, pDC->drawId);

    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (movemask picks up the sign bit of each now-negative coordinate)
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it: a simple point lands in exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simdBBox bbox;
        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
        _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
        _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
        _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the bloated point into every macrotile its bbox touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    AR_END(FEBinPoints, 1);
}
2563
2564 //////////////////////////////////////////////////////////////////////////
2565 /// @brief Bin SIMD lines to the backend.
2566 /// @param pDC - pointer to draw context.
2567 /// @param pa - The primitive assembly object.
2568 /// @param workerId - thread's worker id. Even thread has a unique id.
2569 /// @param tri - Contains line position data for SIMDs worth of points.
2570 /// @param primID - Primitive ID for each line.
2571 /// @param viewportIdx - Viewport Array Index for each line.
2572 void BinLines(
2573 DRAW_CONTEXT *pDC,
2574 PA_STATE& pa,
2575 uint32_t workerId,
2576 simdvector prim[],
2577 uint32_t primMask,
2578 simdscalari primID,
2579 simdscalari viewportIdx)
2580 {
2581 SWR_CONTEXT *pContext = pDC->pContext;
2582
2583 AR_BEGIN(FEBinLines, pDC->drawId);
2584
2585 const API_STATE& state = GetApiState(pDC);
2586 const SWR_RASTSTATE& rastState = state.rastState;
2587 const SWR_FRONTEND_STATE& feState = state.frontendState;
2588 const SWR_GS_STATE& gsState = state.gsState;
2589
2590 // Select attribute processor
2591 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2592 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2593
2594 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2595 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2596
2597 if (!feState.vpTransformDisable)
2598 {
2599 // perspective divide
2600 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2601 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2602
2603 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2604 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2605
2606 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2607 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2608
2609 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2610 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2611
2612 // viewport transform to screen coords
2613 if (state.gsState.emitsViewportArrayIndex)
2614 {
2615 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2616 }
2617 else
2618 {
2619 viewportTransform<2>(prim, state.vpMatrices);
2620 }
2621 }
2622
2623 // adjust for pixel center location
2624 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2625 prim[0].x = _simd_add_ps(prim[0].x, offset);
2626 prim[0].y = _simd_add_ps(prim[0].y, offset);
2627
2628 prim[1].x = _simd_add_ps(prim[1].x, offset);
2629 prim[1].y = _simd_add_ps(prim[1].y, offset);
2630
2631 // convert to fixed point
2632 simdscalari vXi[2], vYi[2];
2633 vXi[0] = fpToFixedPointVertical(prim[0].x);
2634 vYi[0] = fpToFixedPointVertical(prim[0].y);
2635 vXi[1] = fpToFixedPointVertical(prim[1].x);
2636 vYi[1] = fpToFixedPointVertical(prim[1].y);
2637
2638 // compute x-major vs y-major mask
2639 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2640 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2641 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2642 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2643
2644 // cull zero-length lines
2645 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2646 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2647
2648 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2649
2650 uint32_t *pPrimID = (uint32_t *)&primID;
2651 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2652
2653 simdscalar vUnused = _simd_setzero_ps();
2654
2655 // Calc bounding box of lines
2656 simdBBox bbox;
2657 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2658 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2659 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2660 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2661
2662 // bloat bbox by line width along minor axis
2663 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2664 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2665 simdBBox bloatBox;
2666 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2667 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2668 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2669 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2670
2671 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2672 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2673 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2674 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2675
2676 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2677 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2678 if (state.gsState.emitsViewportArrayIndex)
2679 {
2680 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2681 scisXmin, scisYmin, scisXmax, scisYmax);
2682 }
2683 else // broadcast fast path for non-VPAI case.
2684 {
2685 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2686 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2687 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2688 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2689 }
2690
2691 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2692 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2693 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2694 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2695
2696 // Cull prims completely outside scissor
2697 {
2698 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2699 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2700 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2701 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2702 primMask = primMask & ~maskOutsideScissor;
2703 }
2704
2705 if (!primMask)
2706 {
2707 goto endBinLines;
2708 }
2709
2710    // Convert line bbox to macrotile units.
2711 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2712 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2713 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2714 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2715
2716 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2717 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2718 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2719 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2720 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2721
2722 // transpose verts needed for backend
2723 /// @todo modify BE to take non-transformed verts
2724 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2725 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2726 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2727 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2728 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2729
2730 // store render target array index
2731 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2732 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2733 {
2734 simdvector vRtai[2];
2735 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2736 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2737 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2738 }
2739 else
2740 {
2741 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2742 }
2743
2744 // scan remaining valid prims and bin each separately
2745 DWORD primIndex;
2746 while (_BitScanForward(&primIndex, primMask))
2747 {
2748 uint32_t linkageCount = state.backendState.numAttributes;
2749 uint32_t numScalarAttribs = linkageCount * 4;
2750
2751 BE_WORK work;
2752 work.type = DRAW;
2753
2754 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2755
2756 desc.triFlags.frontFacing = 1;
2757 desc.triFlags.primID = pPrimID[primIndex];
2758 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2759 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2760 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2761
2762 work.pfnWork = RasterizeLine;
2763
2764 auto pArena = pDC->pArena;
2765 SWR_ASSERT(pArena != nullptr);
2766
2767 // store active attribs
2768 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2769 desc.numAttribs = linkageCount;
2770 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2771
2772 // store line vertex data
2773 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2774 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2775 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2776 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2777 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2778
2779 // store user clip distances
2780 if (rastState.clipDistanceMask)
2781 {
2782 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2783 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2784 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2785 }
2786
2787 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2788 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2789 {
2790 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2791 {
2792 #if KNOB_ENABLE_TOSS_POINTS
2793 if (!KNOB_TOSS_SETUP_TRIS)
2794 #endif
2795 {
2796 pTileMgr->enqueue(x, y, &work);
2797 }
2798 }
2799 }
2800
2801 primMask &= ~(1 << primIndex);
2802 }
2803
2804 endBinLines:
2805
2806 AR_END(FEBinLines, 1);
2807 }