swr: [rasterizer core] Add macros for mapping ArchRast to buckets
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        raster state.
/// Indexed by SWR_PIXEL_LOCATION: CENTER applies no offset, UL applies a
/// half-pixel offset (presumably shifting evaluation from pixel center to
/// the upper-left corner convention -- confirm against raster usage).
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
102 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
103 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
104 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
105
106 BE_WORK work;
107 work.type = CLEAR;
108 work.pfnWork = ProcessClearBE;
109 work.desc.clear = *pDesc;
110
111 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
112 {
113 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
114 {
115 pTileMgr->enqueue(x, y, &work);
116 }
117 }
118 }
119
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
127 void ProcessStoreTiles(
128 SWR_CONTEXT *pContext,
129 DRAW_CONTEXT *pDC,
130 uint32_t workerId,
131 void *pUserData)
132 {
133 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
134 MacroTileMgr *pTileMgr = pDC->pTileMgr;
135 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
136
137 // queue a store to each macro tile
138 // compute macro tile bounds for the specified rect
139 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
140 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
141 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
142 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
143
144 // store tiles
145 BE_WORK work;
146 work.type = STORETILES;
147 work.pfnWork = ProcessStoreTileBE;
148 work.desc.storeTiles = *pDesc;
149
150 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
151 {
152 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
153 {
154 pTileMgr->enqueue(x, y, &work);
155 }
156 }
157
158 AR_END(FEProcessStoreTiles, 0);
159 }
160
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default path: only macrotiles fully covered by the rect (xmin rounded
    // up to a tile boundary, xmax rounded down).
    // NOTE(review): if the rect spans less than one full tile, the "max"
    // computations here underflow as uint32_t. The fullTilesOnly == false
    // branch below overwrites them, so this only bites when fullTilesOnly
    // is set with a sub-tile rect -- confirm callers never do that.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // NOTE(review): macroTileXMax/YMax are used inclusively in the loops
    // below, so clamping to KNOB_NUM_HOT_TILES_X rather than
    // KNOB_NUM_HOT_TILES_X - 1 looks like an off-by-one -- verify against
    // the hot-tile array bounds.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
215
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Computes the number of primitives given the number of verts.
218 /// @param mode - primitive topology for draw operation.
219 /// @param numPrims - number of vertices or indices for draw.
220 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
221 uint32_t GetNumPrims(
222 PRIMITIVE_TOPOLOGY mode,
223 uint32_t numPrims)
224 {
225 switch (mode)
226 {
227 case TOP_POINT_LIST: return numPrims;
228 case TOP_TRIANGLE_LIST: return numPrims / 3;
229 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
230 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
231 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
232 case TOP_QUAD_LIST: return numPrims / 4;
233 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
234 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
235 case TOP_LINE_LIST: return numPrims / 2;
236 case TOP_LINE_LOOP: return numPrims;
237 case TOP_RECT_LIST: return numPrims / 3;
238 case TOP_LINE_LIST_ADJ: return numPrims / 4;
239 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
240 case TOP_TRI_LIST_ADJ: return numPrims / 6;
241 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
242
243 case TOP_PATCHLIST_1:
244 case TOP_PATCHLIST_2:
245 case TOP_PATCHLIST_3:
246 case TOP_PATCHLIST_4:
247 case TOP_PATCHLIST_5:
248 case TOP_PATCHLIST_6:
249 case TOP_PATCHLIST_7:
250 case TOP_PATCHLIST_8:
251 case TOP_PATCHLIST_9:
252 case TOP_PATCHLIST_10:
253 case TOP_PATCHLIST_11:
254 case TOP_PATCHLIST_12:
255 case TOP_PATCHLIST_13:
256 case TOP_PATCHLIST_14:
257 case TOP_PATCHLIST_15:
258 case TOP_PATCHLIST_16:
259 case TOP_PATCHLIST_17:
260 case TOP_PATCHLIST_18:
261 case TOP_PATCHLIST_19:
262 case TOP_PATCHLIST_20:
263 case TOP_PATCHLIST_21:
264 case TOP_PATCHLIST_22:
265 case TOP_PATCHLIST_23:
266 case TOP_PATCHLIST_24:
267 case TOP_PATCHLIST_25:
268 case TOP_PATCHLIST_26:
269 case TOP_PATCHLIST_27:
270 case TOP_PATCHLIST_28:
271 case TOP_PATCHLIST_29:
272 case TOP_PATCHLIST_30:
273 case TOP_PATCHLIST_31:
274 case TOP_PATCHLIST_32:
275 return numPrims / (mode - TOP_PATCHLIST_BASE);
276
277 case TOP_POLYGON:
278 case TOP_POINT_LIST_BF:
279 case TOP_LINE_STRIP_CONT:
280 case TOP_LINE_STRIP_BF:
281 case TOP_LINE_STRIP_CONT_BF:
282 case TOP_TRIANGLE_FAN_NOSTIPPLE:
283 case TOP_TRI_STRIP_REVERSE:
284 case TOP_PATCHLIST_BASE:
285 case TOP_UNKNOWN:
286 SWR_ASSERT(false, "Unsupported topology: %d", mode);
287 return 0;
288 }
289
290 return 0;
291 }
292
293 //////////////////////////////////////////////////////////////////////////
294 /// @brief Computes the number of verts given the number of primitives.
295 /// @param mode - primitive topology for draw operation.
296 /// @param numPrims - number of primitives for draw.
297 uint32_t GetNumVerts(
298 PRIMITIVE_TOPOLOGY mode,
299 uint32_t numPrims)
300 {
301 switch (mode)
302 {
303 case TOP_POINT_LIST: return numPrims;
304 case TOP_TRIANGLE_LIST: return numPrims * 3;
305 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
306 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
307 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
308 case TOP_QUAD_LIST: return numPrims * 4;
309 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
310 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
311 case TOP_LINE_LIST: return numPrims * 2;
312 case TOP_LINE_LOOP: return numPrims;
313 case TOP_RECT_LIST: return numPrims * 3;
314 case TOP_LINE_LIST_ADJ: return numPrims * 4;
315 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
316 case TOP_TRI_LIST_ADJ: return numPrims * 6;
317 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
318
319 case TOP_PATCHLIST_1:
320 case TOP_PATCHLIST_2:
321 case TOP_PATCHLIST_3:
322 case TOP_PATCHLIST_4:
323 case TOP_PATCHLIST_5:
324 case TOP_PATCHLIST_6:
325 case TOP_PATCHLIST_7:
326 case TOP_PATCHLIST_8:
327 case TOP_PATCHLIST_9:
328 case TOP_PATCHLIST_10:
329 case TOP_PATCHLIST_11:
330 case TOP_PATCHLIST_12:
331 case TOP_PATCHLIST_13:
332 case TOP_PATCHLIST_14:
333 case TOP_PATCHLIST_15:
334 case TOP_PATCHLIST_16:
335 case TOP_PATCHLIST_17:
336 case TOP_PATCHLIST_18:
337 case TOP_PATCHLIST_19:
338 case TOP_PATCHLIST_20:
339 case TOP_PATCHLIST_21:
340 case TOP_PATCHLIST_22:
341 case TOP_PATCHLIST_23:
342 case TOP_PATCHLIST_24:
343 case TOP_PATCHLIST_25:
344 case TOP_PATCHLIST_26:
345 case TOP_PATCHLIST_27:
346 case TOP_PATCHLIST_28:
347 case TOP_PATCHLIST_29:
348 case TOP_PATCHLIST_30:
349 case TOP_PATCHLIST_31:
350 case TOP_PATCHLIST_32:
351 return numPrims * (mode - TOP_PATCHLIST_BASE);
352
353 case TOP_POLYGON:
354 case TOP_POINT_LIST_BF:
355 case TOP_LINE_STRIP_CONT:
356 case TOP_LINE_STRIP_BF:
357 case TOP_LINE_STRIP_CONT_BF:
358 case TOP_TRIANGLE_FAN_NOSTIPPLE:
359 case TOP_TRI_STRIP_REVERSE:
360 case TOP_PATCHLIST_BASE:
361 case TOP_UNKNOWN:
362 SWR_ASSERT(false, "Unsupported topology: %d", mode);
363 return 0;
364 }
365
366 return 0;
367 }
368
369 //////////////////////////////////////////////////////////////////////////
370 /// @brief Return number of verts per primitive.
371 /// @param topology - topology
372 /// @param includeAdjVerts - include adjacent verts in primitive vertices
373 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
374 {
375 uint32_t numVerts = 0;
376 switch (topology)
377 {
378 case TOP_POINT_LIST:
379 case TOP_POINT_LIST_BF:
380 numVerts = 1;
381 break;
382 case TOP_LINE_LIST:
383 case TOP_LINE_STRIP:
384 case TOP_LINE_LIST_ADJ:
385 case TOP_LINE_LOOP:
386 case TOP_LINE_STRIP_CONT:
387 case TOP_LINE_STRIP_BF:
388 case TOP_LISTSTRIP_ADJ:
389 numVerts = 2;
390 break;
391 case TOP_TRIANGLE_LIST:
392 case TOP_TRIANGLE_STRIP:
393 case TOP_TRIANGLE_FAN:
394 case TOP_TRI_LIST_ADJ:
395 case TOP_TRI_STRIP_ADJ:
396 case TOP_TRI_STRIP_REVERSE:
397 case TOP_RECT_LIST:
398 numVerts = 3;
399 break;
400 case TOP_QUAD_LIST:
401 case TOP_QUAD_STRIP:
402 numVerts = 4;
403 break;
404 case TOP_PATCHLIST_1:
405 case TOP_PATCHLIST_2:
406 case TOP_PATCHLIST_3:
407 case TOP_PATCHLIST_4:
408 case TOP_PATCHLIST_5:
409 case TOP_PATCHLIST_6:
410 case TOP_PATCHLIST_7:
411 case TOP_PATCHLIST_8:
412 case TOP_PATCHLIST_9:
413 case TOP_PATCHLIST_10:
414 case TOP_PATCHLIST_11:
415 case TOP_PATCHLIST_12:
416 case TOP_PATCHLIST_13:
417 case TOP_PATCHLIST_14:
418 case TOP_PATCHLIST_15:
419 case TOP_PATCHLIST_16:
420 case TOP_PATCHLIST_17:
421 case TOP_PATCHLIST_18:
422 case TOP_PATCHLIST_19:
423 case TOP_PATCHLIST_20:
424 case TOP_PATCHLIST_21:
425 case TOP_PATCHLIST_22:
426 case TOP_PATCHLIST_23:
427 case TOP_PATCHLIST_24:
428 case TOP_PATCHLIST_25:
429 case TOP_PATCHLIST_26:
430 case TOP_PATCHLIST_27:
431 case TOP_PATCHLIST_28:
432 case TOP_PATCHLIST_29:
433 case TOP_PATCHLIST_30:
434 case TOP_PATCHLIST_31:
435 case TOP_PATCHLIST_32:
436 numVerts = topology - TOP_PATCHLIST_BASE;
437 break;
438 default:
439 SWR_ASSERT(false, "Unsupported topology: %d", topology);
440 break;
441 }
442
443 if (includeAdjVerts)
444 {
445 switch (topology)
446 {
447 case TOP_LISTSTRIP_ADJ:
448 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
449 case TOP_TRI_STRIP_ADJ:
450 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
451 default: break;
452 }
453 }
454
455 return numVerts;
456 }
457
458 //////////////////////////////////////////////////////////////////////////
459 /// @brief Generate mask from remaining work.
460 /// @param numWorkItems - Number of items being worked on by a SIMD.
461 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
462 {
463 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
464 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
465 return _simd_castps_si(vMask(mask));
466 }
467
468
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
template<size_t SimdWidth>
struct GatherScissors
{
    // Primary template: only explicitly specialized SIMD widths are
    // supported; reaching this body means no specialization exists for
    // SimdWidth, so fail loudly.
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};
489
//////////////////////////////////////////////////////////////////////////
/// @brief 8-wide specialization: fetches each lane's scissor rect edges
///        via its viewport index and packs them into SIMD registers.
/// NOTE: _simd_set_epi32 takes arguments highest-lane-first here, so lane 0
/// of the output corresponds to pViewportIndex[7] per the intrinsic's
/// argument ordering convention -- confirm against _simd_set_epi32's
/// definition if lane order matters to the caller.
template<>
struct GatherScissors<8>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    }
};
531
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object providing the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the assembled attributes are written
///                    into before the streamout jit function consumes them.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Verts per prim without adjacency -- streamout never sees adj verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Iterate the set bits of the stream mask, lowest first.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is a dword offset relative to the start of
            // the vertex: 4 floats per attribute at index 'slot'.
            // (PA slots are offset by VERTEX_ATTRIB_START_SLOT above, but
            // prim data is written starting at slot 0.)
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the bit we just handled so the scan terminates.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
622
623 //////////////////////////////////////////////////////////////////////////
624 /// @brief Computes number of invocations. The current index represents
625 /// the start of the SIMD. The max index represents how much work
626 /// items are remaining. If there is less then a SIMD's xmin of work
627 /// then return the remaining amount of work.
628 /// @param curIndex - The start index for the SIMD.
629 /// @param maxIndex - The last index for all work items.
630 static INLINE uint32_t GetNumInvocations(
631 uint32_t curIndex,
632 uint32_t maxIndex)
633 {
634 uint32_t remainder = (maxIndex - curIndex);
635 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
636 }
637
638 //////////////////////////////////////////////////////////////////////////
639 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
640 /// The geometry shader will loop over each active streamout buffer, assembling
641 /// primitives for the downstream stages. When multistream output is enabled,
642 /// the generated stream ID buffer from the GS needs to be converted to a cut
643 /// buffer for the primitive assembler.
644 /// @param stream - stream id to generate the cut buffer for
645 /// @param pStreamIdBase - pointer to the stream ID buffer
646 /// @param numEmittedVerts - Number of total verts emitted by the GS
647 /// @param pCutBuffer - output buffer to write cuts to
648 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
649 {
650 SWR_ASSERT(stream < MAX_SO_STREAMS);
651
652 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
653 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
654
655 for (uint32_t b = 0; b < numOutputBytes; ++b)
656 {
657 uint8_t curInputByte = pStreamIdBase[2*b];
658 uint8_t outByte = 0;
659 for (uint32_t i = 0; i < 4; ++i)
660 {
661 if ((curInputByte & 0x3) != stream)
662 {
663 outByte |= (1 << i);
664 }
665 curInputByte >>= 2;
666 }
667
668 curInputByte = pStreamIdBase[2 * b + 1];
669 for (uint32_t i = 0; i < 4; ++i)
670 {
671 if ((curInputByte & 0x3) != stream)
672 {
673 outByte |= (1 << (i + 4));
674 }
675 curInputByte >>= 2;
676 }
677
678 *pCutBuffer++ = outByte;
679 }
680 }
681
682 THREAD SWR_GS_CONTEXT tlsGsContext;
683
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage: runs the GS over the assembled input prims,
///        then re-assembles the emitted verts and feeds them to streamout
///        and/or the clipper/binner.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (single-stream) or streamID (multi-stream) buffer
///                     written by the GS.
/// @param pStreamCutBuffer - scratch buffer for per-stream cut conversion.
/// @param pSoPrimData - scratch buffer handed to StreamOut.
/// @param primID - per-lane primitive IDs for the input prims.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Layout of the GS output buffer: per input prim, enough SIMD vertex
    // batches to hold maxNumVerts; one such region per GS instance.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // Single stream: 1 cut bit per vert. Multi stream: 2 streamID bits per
    // vert, dword aligned. Must match AllocateGsBuffers' sizing.
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            // NOTE(review): vertexCount is indexed by inputPrim only, not
            // per-instance -- this assumes every GS instance emits the same
            // vert count (or that vertexCount reflects the last instance);
            // confirm against the GS jit's vertexCount semantics.
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE(review): this local intentionally shadows the
                // pCutBuffer parameter for the remainder of the loop body.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    AR_END(FEGeometryShader, 1);
}
904
905 //////////////////////////////////////////////////////////////////////////
906 /// @brief Allocate GS buffers
907 /// @param pDC - pointer to draw context.
908 /// @param state - API state
909 /// @param ppGsOut - pointer to GS output buffer allocation
910 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
911 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
912 void **ppStreamCutBuffer)
913 {
914 auto pArena = pDC->pArena;
915 SWR_ASSERT(pArena != nullptr);
916 SWR_ASSERT(state.gsState.gsEnable);
917 // allocate arena space to hold GS output verts
918 // @todo pack attribs
919 // @todo support multiple streams
920 const uint32_t vertexStride = sizeof(simdvertex);
921 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
922 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
923 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
924
925 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
926 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
927 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
928 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
929
930 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
931 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
932
933 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
934 if (state.gsState.isSingleStream)
935 {
936 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
937 *ppStreamCutBuffer = nullptr;
938 }
939 else
940 {
941 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
942 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
943 }
944
945 }
946
947 //////////////////////////////////////////////////////////////////////////
948 /// @brief Contains all data generated by the HS and passed to the
949 /// tessellator and DS.
950 struct TessellationThreadLocalData
951 {
952 SWR_HS_CONTEXT hsContext;
953 ScalarPatch patchData[KNOB_SIMD_WIDTH];
954 void* pTxCtx;
955 size_t tsCtxSize;
956
957 simdscalar* pDSOutput;
958 size_t numDSOutputVectors;
959 };
960
961 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
962
963 //////////////////////////////////////////////////////////////////////////
964 /// @brief Allocate tessellation data for this worker thread.
965 INLINE
966 static void AllocateTessellationData(SWR_CONTEXT* pContext)
967 {
968 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
969 if (gt_pTessellationThreadData == nullptr)
970 {
971 gt_pTessellationThreadData = (TessellationThreadLocalData*)
972 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
973 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
974 }
975 }
976
977 //////////////////////////////////////////////////////////////////////////
978 /// @brief Implements Tessellation Stages.
979 /// @param pDC - pointer to draw context.
980 /// @param workerId - thread's worker id. Even thread has a unique id.
981 /// @param pa - The primitive assembly object.
982 /// @param pGsOut - output stream for GS
983 template <
984 typename HasGeometryShaderT,
985 typename HasStreamOutT,
986 typename HasRastT>
987 static void TessellationStages(
988 DRAW_CONTEXT *pDC,
989 uint32_t workerId,
990 PA_STATE& pa,
991 void* pGsOut,
992 void* pCutBuffer,
993 void* pCutStreamBuffer,
994 uint32_t* pSoPrimData,
995 simdscalari primID)
996 {
997 SWR_CONTEXT *pContext = pDC->pContext;
998 const API_STATE& state = GetApiState(pDC);
999 const SWR_TS_STATE& tsState = state.tsState;
1000
1001 SWR_ASSERT(gt_pTessellationThreadData);
1002
1003 HANDLE tsCtx = TSInitCtx(
1004 tsState.domain,
1005 tsState.partitioning,
1006 tsState.tsOutputTopology,
1007 gt_pTessellationThreadData->pTxCtx,
1008 gt_pTessellationThreadData->tsCtxSize);
1009 if (tsCtx == nullptr)
1010 {
1011 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1012 tsCtx = TSInitCtx(
1013 tsState.domain,
1014 tsState.partitioning,
1015 tsState.tsOutputTopology,
1016 gt_pTessellationThreadData->pTxCtx,
1017 gt_pTessellationThreadData->tsCtxSize);
1018 }
1019 SWR_ASSERT(tsCtx);
1020
1021 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1022 if (HasRastT::value)
1023 {
1024 switch (tsState.postDSTopology)
1025 {
1026 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1027 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
1028 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
1029 default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
1030 }
1031 }
1032
1033 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1034 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1035 hsContext.PrimitiveID = primID;
1036
1037 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1038 // Max storage for one attribute for an entire simdprimitive
1039 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1040
1041 // assemble all attributes for the input primitives
1042 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1043 {
1044 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1045 pa.Assemble(attribSlot, simdattrib);
1046
1047 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1048 {
1049 hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1050 }
1051 }
1052
1053 #if defined(_DEBUG)
1054 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1055 #endif
1056
1057 uint32_t numPrims = pa.NumPrims();
1058 hsContext.mask = GenerateMask(numPrims);
1059
1060 // Run the HS
1061 AR_BEGIN(FEHullShader, pDC->drawId);
1062 state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1063 AR_END(FEHullShader, 0);
1064
1065 UPDATE_STAT_FE(HsInvocations, numPrims);
1066
1067 const uint32_t* pPrimId = (const uint32_t*)&primID;
1068
1069 for (uint32_t p = 0; p < numPrims; ++p)
1070 {
1071 // Run Tessellator
1072 SWR_TS_TESSELLATED_DATA tsData = { 0 };
1073 AR_BEGIN(FETessellation, pDC->drawId);
1074 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1075 AR_END(FETessellation, 0);
1076
1077 if (tsData.NumPrimitives == 0)
1078 {
1079 continue;
1080 }
1081 SWR_ASSERT(tsData.NumDomainPoints);
1082
1083 // Allocate DS Output memory
1084 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1085 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1086 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1087 if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1088 {
1089 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1090 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1091 gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1092 }
1093 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1094 SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1095
1096 #if defined(_DEBUG)
1097 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1098 #endif
1099
1100 // Run Domain Shader
1101 SWR_DS_CONTEXT dsContext;
1102 dsContext.PrimitiveID = pPrimId[p];
1103 dsContext.pCpIn = &hsContext.pCPout[p];
1104 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1105 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1106 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1107 dsContext.vectorStride = requiredDSVectorInvocations;
1108
1109 uint32_t dsInvocations = 0;
1110
1111 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1112 {
1113 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1114
1115 AR_BEGIN(FEDomainShader, pDC->drawId);
1116 state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1117 AR_END(FEDomainShader, 0);
1118
1119 dsInvocations += KNOB_SIMD_WIDTH;
1120 }
1121 UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1122
1123 PA_TESS tessPa(
1124 pDC,
1125 dsContext.pOutputData,
1126 dsContext.vectorStride,
1127 tsState.numDsOutputAttribs,
1128 tsData.ppIndices,
1129 tsData.NumPrimitives,
1130 tsState.postDSTopology);
1131
1132 while (tessPa.HasWork())
1133 {
1134 if (HasGeometryShaderT::value)
1135 {
1136 GeometryShaderStage<HasStreamOutT, HasRastT>(
1137 pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1138 _simd_set1_epi32(dsContext.PrimitiveID));
1139 }
1140 else
1141 {
1142 if (HasStreamOutT::value)
1143 {
1144 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1145 }
1146
1147 if (HasRastT::value)
1148 {
1149 simdvector prim[3]; // Only deal with triangles, lines, or points
1150 AR_BEGIN(FEPAAssemble, pDC->drawId);
1151 #if SWR_ENABLE_ASSERTS
1152 bool assemble =
1153 #endif
1154 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1155 AR_END(FEPAAssemble, 1);
1156 SWR_ASSERT(assemble);
1157
1158 SWR_ASSERT(pfnClipFunc);
1159 pfnClipFunc(pDC, tessPa, workerId, prim,
1160 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1161 }
1162 }
1163
1164 tessPa.NextPrim();
1165
1166 } // while (tessPa.HasWork())
1167 } // for (uint32_t p = 0; p < numPrims; ++p)
1168
1169 TSDestroyCtx(tsCtx);
1170 }
1171
1172 //////////////////////////////////////////////////////////////////////////
1173 /// @brief FE handler for SwrDraw.
1174 /// @tparam IsIndexedT - Is indexed drawing enabled
1175 /// @tparam HasTessellationT - Is tessellation enabled
1176 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1177 /// @tparam HasStreamOutT - Is stream-out enabled
1178 /// @tparam HasRastT - Is rasterization enabled
1179 /// @param pContext - pointer to SWR context.
1180 /// @param pDC - pointer to draw context.
1181 /// @param workerId - thread's worker id.
1182 /// @param pUserData - Pointer to DRAW_WORK
1183 template <
1184 typename IsIndexedT,
1185 typename IsCutIndexEnabledT,
1186 typename HasTessellationT,
1187 typename HasGeometryShaderT,
1188 typename HasStreamOutT,
1189 typename HasRastT>
1190 void ProcessDraw(
1191 SWR_CONTEXT *pContext,
1192 DRAW_CONTEXT *pDC,
1193 uint32_t workerId,
1194 void *pUserData)
1195 {
1196
1197 #if KNOB_ENABLE_TOSS_POINTS
1198 if (KNOB_TOSS_QUEUE_FE)
1199 {
1200 return;
1201 }
1202 #endif
1203
1204 AR_BEGIN(FEProcessDraw, pDC->drawId);
1205
1206 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1207 const API_STATE& state = GetApiState(pDC);
1208 __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1209 SWR_VS_CONTEXT vsContext;
1210 simdvertex vin;
1211
1212 int indexSize = 0;
1213 uint32_t endVertex = work.numVerts;
1214
1215 const int32_t* pLastRequestedIndex = nullptr;
1216 if (IsIndexedT::value)
1217 {
1218 switch (work.type)
1219 {
1220 case R32_UINT:
1221 indexSize = sizeof(uint32_t);
1222 pLastRequestedIndex = &(work.pIB[endVertex]);
1223 break;
1224 case R16_UINT:
1225 indexSize = sizeof(uint16_t);
1226 // nasty address offset to last index
1227 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1228 break;
1229 case R8_UINT:
1230 indexSize = sizeof(uint8_t);
1231 // nasty address offset to last index
1232 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1233 break;
1234 default:
1235 SWR_ASSERT(0);
1236 }
1237 }
1238 else
1239 {
1240 // No cuts, prune partial primitives.
1241 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1242 }
1243
1244 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1245 fetchInfo.pStreams = &state.vertexBuffers[0];
1246 fetchInfo.StartInstance = work.startInstance;
1247 fetchInfo.StartVertex = 0;
1248
1249 vsContext.pVin = &vin;
1250
1251 if (IsIndexedT::value)
1252 {
1253 fetchInfo.BaseVertex = work.baseVertex;
1254
1255 // if the entire index buffer isn't being consumed, set the last index
1256 // so that fetches < a SIMD wide will be masked off
1257 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1258 if (pLastRequestedIndex < fetchInfo.pLastIndex)
1259 {
1260 fetchInfo.pLastIndex = pLastRequestedIndex;
1261 }
1262 }
1263 else
1264 {
1265 fetchInfo.StartVertex = work.startVertex;
1266 }
1267
1268 #ifdef KNOB_ENABLE_RDTSC
1269 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1270 #endif
1271
1272 void* pGsOut = nullptr;
1273 void* pCutBuffer = nullptr;
1274 void* pStreamCutBuffer = nullptr;
1275 if (HasGeometryShaderT::value)
1276 {
1277 AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1278 }
1279
1280 if (HasTessellationT::value)
1281 {
1282 SWR_ASSERT(state.tsState.tsEnable == true);
1283 SWR_ASSERT(state.pfnHsFunc != nullptr);
1284 SWR_ASSERT(state.pfnDsFunc != nullptr);
1285
1286 AllocateTessellationData(pContext);
1287 }
1288 else
1289 {
1290 SWR_ASSERT(state.tsState.tsEnable == false);
1291 SWR_ASSERT(state.pfnHsFunc == nullptr);
1292 SWR_ASSERT(state.pfnDsFunc == nullptr);
1293 }
1294
1295 // allocate space for streamout input prim data
1296 uint32_t* pSoPrimData = nullptr;
1297 if (HasStreamOutT::value)
1298 {
1299 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1300 }
1301
1302 // choose primitive assembler
1303 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1304 PA_STATE& pa = paFactory.GetPA();
1305
1306 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1307 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1308 {
1309 simdscalari vIndex;
1310 uint32_t i = 0;
1311
1312 if (IsIndexedT::value)
1313 {
1314 fetchInfo.pIndices = work.pIB;
1315 }
1316 else
1317 {
1318 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1319 fetchInfo.pIndices = (const int32_t*)&vIndex;
1320 }
1321
1322 fetchInfo.CurInstance = instanceNum;
1323 vsContext.InstanceID = instanceNum;
1324
1325 while (pa.HasWork())
1326 {
1327 // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1328 // So we need to keep this outside of (i < endVertex) check.
1329 simdmask* pvCutIndices = nullptr;
1330 if (IsIndexedT::value)
1331 {
1332 pvCutIndices = &pa.GetNextVsIndices();
1333 }
1334
1335 simdvertex& vout = pa.GetNextVsOutput();
1336 vsContext.pVout = &vout;
1337
1338 if (i < endVertex)
1339 {
1340
1341 // 1. Execute FS/VS for a single SIMD.
1342 AR_BEGIN(FEFetchShader, pDC->drawId);
1343 state.pfnFetchFunc(fetchInfo, vin);
1344 AR_END(FEFetchShader, 0);
1345
1346 // forward fetch generated vertex IDs to the vertex shader
1347 vsContext.VertexID = fetchInfo.VertexID;
1348
1349 // Setup active mask for vertex shader.
1350 vsContext.mask = GenerateMask(endVertex - i);
1351
1352 // forward cut mask to the PA
1353 if (IsIndexedT::value)
1354 {
1355 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1356 }
1357
1358 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1359
1360 #if KNOB_ENABLE_TOSS_POINTS
1361 if (!KNOB_TOSS_FETCH)
1362 #endif
1363 {
1364 AR_BEGIN(FEVertexShader, pDC->drawId);
1365 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1366 AR_END(FEVertexShader, 0);
1367
1368 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1369 }
1370 }
1371
1372 // 2. Assemble primitives given the last two SIMD.
1373 do
1374 {
1375 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1376 // PaAssemble returns false if there is not enough verts to assemble.
1377 AR_BEGIN(FEPAAssemble, pDC->drawId);
1378 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1379 AR_END(FEPAAssemble, 1);
1380
1381 #if KNOB_ENABLE_TOSS_POINTS
1382 if (!KNOB_TOSS_FETCH)
1383 #endif
1384 {
1385 #if KNOB_ENABLE_TOSS_POINTS
1386 if (!KNOB_TOSS_VS)
1387 #endif
1388 {
1389 if (assemble)
1390 {
1391 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1392
1393 if (HasTessellationT::value)
1394 {
1395 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1396 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1397 }
1398 else if (HasGeometryShaderT::value)
1399 {
1400 GeometryShaderStage<HasStreamOutT, HasRastT>(
1401 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1402 }
1403 else
1404 {
1405 // If streamout is enabled then stream vertices out to memory.
1406 if (HasStreamOutT::value)
1407 {
1408 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1409 }
1410
1411 if (HasRastT::value)
1412 {
1413 SWR_ASSERT(pDC->pState->pfnProcessPrims);
1414 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1415 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
1416 }
1417 }
1418 }
1419 }
1420 }
1421 } while (pa.NextPrim());
1422
1423 i += KNOB_SIMD_WIDTH;
1424 if (IsIndexedT::value)
1425 {
1426 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1427 }
1428 else
1429 {
1430 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1431 }
1432 }
1433 pa.Reset();
1434 }
1435
1436 AR_END(FEProcessDraw, numPrims * work.numInstances);
1437 }
1438
// Adapter used by TemplateArgUnroller to map a pack of boolean template
// arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1449
1450
// Selector for correct templated Draw front-end function.
// Each runtime bool is turned into a compile-time template argument of
// ProcessDraw via TemplateArgUnroller/FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1462
1463 //////////////////////////////////////////////////////////////////////////
1464 /// @brief Processes attributes for the backend based on linkage mask and
1465 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1466 /// @param pDC - Draw context
1467 /// @param pa - Primitive Assembly state
1468 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1469 /// @param pLinkageMap - maps VS attribute slot to PS slot
1470 /// @param triIndex - Triangle to process attributes for
1471 /// @param pBuffer - Output result
1472 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1473 INLINE void ProcessAttributes(
1474 DRAW_CONTEXT *pDC,
1475 PA_STATE&pa,
1476 uint32_t triIndex,
1477 uint32_t primId,
1478 float *pBuffer)
1479 {
1480 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1481 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1482 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1483 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1484 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1485 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1486
1487 static const float constTable[3][4] = {
1488 {0.0f, 0.0f, 0.0f, 0.0f},
1489 {0.0f, 0.0f, 0.0f, 1.0f},
1490 {1.0f, 1.0f, 1.0f, 1.0f}
1491 };
1492
1493 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1494 {
1495 uint32_t inputSlot;
1496 if (IsSwizzledT::value)
1497 {
1498 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1499 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1500
1501 }
1502 else
1503 {
1504 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1505 }
1506
1507 __m128 attrib[3]; // triangle attribs (always 4 wide)
1508 float* pAttribStart = pBuffer;
1509
1510 if (HasConstantInterpT::value || IsDegenerate::value)
1511 {
1512 if (_bittest(&constantInterpMask, i))
1513 {
1514 uint32_t vid;
1515 uint32_t adjustedTriIndex;
1516 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1517 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1518 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1519 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1520 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1521
1522 switch (topo) {
1523 case TOP_QUAD_LIST:
1524 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1525 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1526 break;
1527 case TOP_QUAD_STRIP:
1528 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1529 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1530 break;
1531 case TOP_TRIANGLE_STRIP:
1532 adjustedTriIndex = triIndex;
1533 vid = (triIndex & 1)
1534 ? tristripProvokingVertex[provokingVertex]
1535 : provokingVertex;
1536 break;
1537 default:
1538 adjustedTriIndex = triIndex;
1539 vid = provokingVertex;
1540 break;
1541 }
1542
1543 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1544
1545 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1546 {
1547 _mm_store_ps(pBuffer, attrib[vid]);
1548 pBuffer += 4;
1549 }
1550 }
1551 else
1552 {
1553 pa.AssembleSingle(inputSlot, triIndex, attrib);
1554
1555 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1556 {
1557 _mm_store_ps(pBuffer, attrib[i]);
1558 pBuffer += 4;
1559 }
1560 }
1561 }
1562 else
1563 {
1564 pa.AssembleSingle(inputSlot, triIndex, attrib);
1565
1566 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1567 {
1568 _mm_store_ps(pBuffer, attrib[i]);
1569 pBuffer += 4;
1570 }
1571 }
1572
1573 // pad out the attrib buffer to 3 verts to ensure the triangle
1574 // interpolation code in the pixel shader works correctly for the
1575 // 3 topologies - point, line, tri. This effectively zeros out the
1576 // effect of the missing vertices in the triangle interpolation.
1577 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1578 {
1579 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1580 pBuffer += 4;
1581 }
1582
1583 // check for constant source overrides
1584 if (IsSwizzledT::value)
1585 {
1586 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1587 if (mask)
1588 {
1589 DWORD comp;
1590 while (_BitScanForward(&comp, mask))
1591 {
1592 mask &= ~(1 << comp);
1593
1594 float constantValue = 0.0f;
1595 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1596 {
1597 case SWR_CONSTANT_SOURCE_CONST_0000:
1598 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1599 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1600 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1601 break;
1602 case SWR_CONSTANT_SOURCE_PRIM_ID:
1603 constantValue = *(float*)&primId;
1604 break;
1605 }
1606
1607 // apply constant value to all 3 vertices
1608 for (uint32_t v = 0; v < 3; ++v)
1609 {
1610 pAttribStart[comp + v * 4] = constantValue;
1611 }
1612 }
1613 }
1614 }
1615 }
1616 }
1617
1618
// Signature of a fully-instantiated ProcessAttributes specialization.
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

// Adapter used by TemplateArgUnroller to map template arguments onto the
// matching ProcessAttributes instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1631
// Selects the ProcessAttributes template instantiation matching the given
// runtime parameters (NumVerts is clamped to the template's 1..3 range by
// the IntArg unroller).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1636
1637 //////////////////////////////////////////////////////////////////////////
1638 /// @brief Processes enabled user clip distances. Loads the active clip
1639 /// distances from the PA, sets up barycentric equations, and
1640 /// stores the results to the output buffer
1641 /// @param pa - Primitive Assembly state
1642 /// @param primIndex - primitive index to process
1643 /// @param clipDistMask - mask of enabled clip distances
1644 /// @param pUserClipBuffer - buffer to store results
1645 template<uint32_t NumVerts>
1646 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1647 {
1648 DWORD clipDist;
1649 while (_BitScanForward(&clipDist, clipDistMask))
1650 {
1651 clipDistMask &= ~(1 << clipDist);
1652 uint32_t clipSlot = clipDist >> 2;
1653 uint32_t clipComp = clipDist & 0x3;
1654 uint32_t clipAttribSlot = clipSlot == 0 ?
1655 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1656
1657 __m128 primClipDist[3];
1658 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1659
1660 float vertClipDist[NumVerts];
1661 for (uint32_t e = 0; e < NumVerts; ++e)
1662 {
1663 OSALIGNSIMD(float) aVertClipDist[4];
1664 _mm_store_ps(aVertClipDist, primClipDist[e]);
1665 vertClipDist[e] = aVertClipDist[clipComp];
1666 };
1667
1668 // setup plane equations for barycentric interpolation in the backend
1669 float baryCoeff[NumVerts];
1670 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1671 {
1672 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1673 }
1674 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1675
1676 for (uint32_t e = 0; e < NumVerts; ++e)
1677 {
1678 *(pUserClipBuffer++) = baryCoeff[e];
1679 }
1680 }
1681 }
1682
1683 //////////////////////////////////////////////////////////////////////////
1684 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1685 /// Point precision from FP32.
1686 template <typename PT = FixedPointTraits<Fixed_16_8>>
1687 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1688 {
1689 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1690 return _simd_cvtps_epi32(vFixed);
1691 }
1692
1693 //////////////////////////////////////////////////////////////////////////
1694 /// @brief Helper function to set the X,Y coords of a triangle to the
1695 /// requested Fixed Point precision from FP32.
1696 /// @param tri: simdvector[3] of FP triangle verts
1697 /// @param vXi: fixed point X coords of tri verts
1698 /// @param vYi: fixed point Y coords of tri verts
1699 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1700 {
1701 vXi[0] = fpToFixedPointVertical(tri[0].x);
1702 vYi[0] = fpToFixedPointVertical(tri[0].y);
1703 vXi[1] = fpToFixedPointVertical(tri[1].x);
1704 vYi[1] = fpToFixedPointVertical(tri[1].y);
1705 vXi[2] = fpToFixedPointVertical(tri[2].x);
1706 vYi[2] = fpToFixedPointVertical(tri[2].y);
1707 }
1708
1709 //////////////////////////////////////////////////////////////////////////
1710 /// @brief Calculate bounding box for current triangle
1711 /// @tparam CT: ConservativeRastFETraits type
1712 /// @param vX: fixed point X position for triangle verts
1713 /// @param vY: fixed point Y position for triangle verts
1714 /// @param bbox: fixed point bbox
1715 /// *Note*: expects vX, vY to be in the correct precision for the type
1716 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1717 template <typename CT>
1718 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1719 {
1720 simdscalari vMinX = vX[0];
1721 vMinX = _simd_min_epi32(vMinX, vX[1]);
1722 vMinX = _simd_min_epi32(vMinX, vX[2]);
1723
1724 simdscalari vMaxX = vX[0];
1725 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1726 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1727
1728 simdscalari vMinY = vY[0];
1729 vMinY = _simd_min_epi32(vMinY, vY[1]);
1730 vMinY = _simd_min_epi32(vMinY, vY[2]);
1731
1732 simdscalari vMaxY = vY[0];
1733 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1734 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1735
1736 bbox.xmin = vMinX;
1737 bbox.xmax = vMaxX;
1738 bbox.ymin = vMinY;
1739 bbox.ymax = vMaxY;
1740 }
1741
1742 //////////////////////////////////////////////////////////////////////////
1743 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1744 /// Offsets BBox for conservative rast
1745 template <>
1746 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1747 {
1748 // FE conservative rast traits
1749 typedef FEConservativeRastT CT;
1750
1751 simdscalari vMinX = vX[0];
1752 vMinX = _simd_min_epi32(vMinX, vX[1]);
1753 vMinX = _simd_min_epi32(vMinX, vX[2]);
1754
1755 simdscalari vMaxX = vX[0];
1756 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1757 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1758
1759 simdscalari vMinY = vY[0];
1760 vMinY = _simd_min_epi32(vMinY, vY[1]);
1761 vMinY = _simd_min_epi32(vMinY, vY[2]);
1762
1763 simdscalari vMaxY = vY[0];
1764 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1765 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1766
1767 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1768 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1769 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1770 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1771 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1772 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1773 }
1774
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param triMask - Bitmask with one bit set per valid triangle lane in tri.
/// @param primID - Primitive ID for each triangle.
/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;


    // 1/w per vertex; defaults to 1 for the VP-transform-disabled path
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            // per-lane viewport selection when the GS emits a VP array index
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup - compute A and B edge coefficients in fixed point
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (2x signed area); 64-bit per triangle, split across two registers
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if(!CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if(CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if(cullZeroAreaMask > 0)
        {
            // e0 = v1-v0; edge is degenerate when both endpoints coincide
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // scatter the per-lane edge bits into 3-bit-per-triangle groups via pdep
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // a set bit enables the edge; degenerate edges are cleared
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
                                    (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            // snap bbox edges to the nearest pixel-center grid (x.8 fixed point);
            // equal min/max after snapping means no pixel center is covered
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if(CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // spill macrotile extents to scalar arrays for the per-triangle bin loop below
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if(CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                        (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                                        (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data: 4 floats each of x, y, z, 1/w
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the same work item to every macrotile the triangle's bbox overlaps
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    AR_END(FEBinTriangles, 1);
}
2180
//////////////////////////////////////////////////////////////////////////
/// @brief Chooser functor used by TemplateArgUnroller to map runtime
///        boolean arguments onto the matching BinTriangles template
///        instantiation (via ConservativeRastFETraits).
struct FEBinTrianglesChooser
{
    typedef PFN_PROCESS_PRIMS FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
    }
};
2191
// Selector for correct templated BinTriangles function
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}
2197
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Supports both the fast path for
///        simple 1-pixel points and the general path for sized points,
///        which are bloated to a bbox and binned like triangles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points
///               (only prim[0] is used).
/// @param primMask - Bitmask with one bit set per valid point lane.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - viewport array index for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinPoints, pDC->drawId);

    // points only have a single vertex
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (negative coords have the sign bit set, which movemask extracts)
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it - a simple point hits exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simdBBox bbox;
        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        // spill macrotile extents to scalar arrays for the per-point bin loop below
        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
        _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
        _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
        _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the same work item to every macrotile the bloated bbox overlaps
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    AR_END(FEBinPoints, 1);
}
2533
2534 //////////////////////////////////////////////////////////////////////////
2535 /// @brief Bin SIMD lines to the backend.
2536 /// @param pDC - pointer to draw context.
2537 /// @param pa - The primitive assembly object.
2538 /// @param workerId - thread's worker id. Even thread has a unique id.
2539 /// @param tri - Contains line position data for SIMDs worth of points.
2540 /// @param primID - Primitive ID for each line.
2541 /// @param viewportIdx - Viewport Array Index for each line.
2542 void BinLines(
2543 DRAW_CONTEXT *pDC,
2544 PA_STATE& pa,
2545 uint32_t workerId,
2546 simdvector prim[],
2547 uint32_t primMask,
2548 simdscalari primID,
2549 simdscalari viewportIdx)
2550 {
2551 SWR_CONTEXT *pContext = pDC->pContext;
2552
2553 AR_BEGIN(FEBinLines, pDC->drawId);
2554
2555 const API_STATE& state = GetApiState(pDC);
2556 const SWR_RASTSTATE& rastState = state.rastState;
2557 const SWR_FRONTEND_STATE& feState = state.frontendState;
2558 const SWR_GS_STATE& gsState = state.gsState;
2559
2560 // Select attribute processor
2561 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2562 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2563
2564 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2565 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2566
2567 if (!feState.vpTransformDisable)
2568 {
2569 // perspective divide
2570 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2571 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2572
2573 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2574 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2575
2576 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2577 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2578
2579 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2580 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2581
2582 // viewport transform to screen coords
2583 if (state.gsState.emitsViewportArrayIndex)
2584 {
2585 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2586 }
2587 else
2588 {
2589 viewportTransform<2>(prim, state.vpMatrices);
2590 }
2591 }
2592
2593 // adjust for pixel center location
2594 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2595 prim[0].x = _simd_add_ps(prim[0].x, offset);
2596 prim[0].y = _simd_add_ps(prim[0].y, offset);
2597
2598 prim[1].x = _simd_add_ps(prim[1].x, offset);
2599 prim[1].y = _simd_add_ps(prim[1].y, offset);
2600
2601 // convert to fixed point
2602 simdscalari vXi[2], vYi[2];
2603 vXi[0] = fpToFixedPointVertical(prim[0].x);
2604 vYi[0] = fpToFixedPointVertical(prim[0].y);
2605 vXi[1] = fpToFixedPointVertical(prim[1].x);
2606 vYi[1] = fpToFixedPointVertical(prim[1].y);
2607
2608 // compute x-major vs y-major mask
2609 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2610 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2611 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2612 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2613
2614 // cull zero-length lines
2615 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2616 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2617
2618 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2619
2620 uint32_t *pPrimID = (uint32_t *)&primID;
2621 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2622
2623 simdscalar vUnused = _simd_setzero_ps();
2624
2625 // Calc bounding box of lines
2626 simdBBox bbox;
2627 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2628 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2629 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2630 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2631
2632 // bloat bbox by line width along minor axis
2633 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2634 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2635 simdBBox bloatBox;
2636 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2637 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2638 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2639 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2640
2641 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2642 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2643 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2644 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2645
2646 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2647 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2648 if (state.gsState.emitsViewportArrayIndex)
2649 {
2650 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2651 scisXmin, scisYmin, scisXmax, scisYmax);
2652 }
2653 else // broadcast fast path for non-VPAI case.
2654 {
2655 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2656 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2657 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2658 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2659 }
2660
2661 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2662 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2663 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2664 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2665
2666 // Cull prims completely outside scissor
2667 {
2668 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2669 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2670 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2671 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2672 primMask = primMask & ~maskOutsideScissor;
2673 }
2674
2675 if (!primMask)
2676 {
2677 goto endBinLines;
2678 }
2679
        // Convert line bbox to macrotile units.
2681 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2682 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2683 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2684 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2685
2686 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2687 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2688 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2689 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2690 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2691
2692 // transpose verts needed for backend
2693 /// @todo modify BE to take non-transformed verts
2694 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2695 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2696 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2697 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2698 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2699
2700 // store render target array index
2701 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2702 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2703 {
2704 simdvector vRtai[2];
2705 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2706 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2707 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2708 }
2709 else
2710 {
2711 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2712 }
2713
2714 // scan remaining valid prims and bin each separately
2715 DWORD primIndex;
2716 while (_BitScanForward(&primIndex, primMask))
2717 {
2718 uint32_t linkageCount = state.backendState.numAttributes;
2719 uint32_t numScalarAttribs = linkageCount * 4;
2720
2721 BE_WORK work;
2722 work.type = DRAW;
2723
2724 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2725
2726 desc.triFlags.frontFacing = 1;
2727 desc.triFlags.primID = pPrimID[primIndex];
2728 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2729 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2730 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2731
2732 work.pfnWork = RasterizeLine;
2733
2734 auto pArena = pDC->pArena;
2735 SWR_ASSERT(pArena != nullptr);
2736
2737 // store active attribs
2738 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2739 desc.numAttribs = linkageCount;
2740 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2741
2742 // store line vertex data
2743 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2744 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2745 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2746 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2747 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2748
2749 // store user clip distances
2750 if (rastState.clipDistanceMask)
2751 {
2752 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2753 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2754 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2755 }
2756
2757 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2758 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2759 {
2760 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2761 {
2762 #if KNOB_ENABLE_TOSS_POINTS
2763 if (!KNOB_TOSS_SETUP_TRIS)
2764 #endif
2765 {
2766 pTileMgr->enqueue(x, y, &work);
2767 }
2768 }
2769 }
2770
2771 primMask &= ~(1 << primIndex);
2772 }
2773
2774 endBinLines:
2775
2776 AR_END(FEBinLines, 1);
2777 }