a49ec7a9fbb2f2f7e57dc32664ccffa2b7c20a6a
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        raster state. Indexed by SWR_PIXEL_LOCATION: pixel-center
///        rasterization adds no offset; upper-left adds half a pixel,
///        broadcast across all SIMD lanes.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
102 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
103 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
104 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
105
106 BE_WORK work;
107 work.type = CLEAR;
108 work.pfnWork = ProcessClearBE;
109 work.desc.clear = *pDesc;
110
111 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
112 {
113 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
114 {
115 pTileMgr->enqueue(x, y, &work);
116 }
117 }
118 }
119
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
127 void ProcessStoreTiles(
128 SWR_CONTEXT *pContext,
129 DRAW_CONTEXT *pDC,
130 uint32_t workerId,
131 void *pUserData)
132 {
133 RDTSC_START(FEProcessStoreTiles);
134 MacroTileMgr *pTileMgr = pDC->pTileMgr;
135 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
136
137 // queue a store to each macro tile
138 // compute macro tile bounds for the specified rect
139 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
140 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
141 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
142 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
143
144 // store tiles
145 BE_WORK work;
146 work.type = STORETILES;
147 work.pfnWork = ProcessStoreTileBE;
148 work.desc.storeTiles = *pDesc;
149
150 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
151 {
152 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
153 {
154 pTileMgr->enqueue(x, y, &work);
155 }
156 }
157
158 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
159 }
160
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data (DISCARD_INVALIDATE_TILES_DESC).
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default: only macrotiles fully contained in the rect (min rounded up,
    // max rounded down).
    // NOTE(review): these unsigned subtractions wrap when the rect is
    // narrower/shorter than one macrotile (e.g. rect.xmax < KNOB_MACROTILE_X_DIM
    // makes macroTileXMax huge) -- presumably callers pass tile-aligned rects
    // when fullTilesOnly is set; verify against callers.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // Clamp to the hot-tile array bounds.
    // NOTE(review): clamping to KNOB_NUM_HOT_TILES_X/Y (a count) rather than
    // count - 1 (the last valid index) looks suspicious, as does the
    // signed-int32 min on an unsigned value -- confirm the intended range.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    // Enqueue a work item to each macrotile in the inclusive range.
    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
215
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @return primitive count assembled from that many verts; 0 when there
///         are too few verts for one primitive or the topology is
///         unsupported.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // Patch lists: the control-point count per patch is encoded in the
    // topology enum as the offset from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    // Topologies this frontend does not assemble.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
292
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
///        Inverse of GetNumPrims.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
/// @return vertex count needed to produce numPrims primitives; 0 for
///         unsupported topologies.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // Patch lists: the control-point count per patch is encoded in the
    // topology enum as the offset from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    // Topologies this frontend does not assemble.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
368
//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
/// @return verts per primitive for the topology (strips count as their
///         list-equivalent primitive size); 0 for unsupported topologies.
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    // Patch lists: the control-point count per patch is encoded in the
    // topology enum as the offset from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_ASSERT(false, "Unsupported topology: %d", topology);
        break;
    }

    // Adjacency topologies carry extra neighbor verts per primitive; widen
    // the count when the caller wants those included.
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
457
458 //////////////////////////////////////////////////////////////////////////
459 /// @brief Generate mask from remaining work.
460 /// @param numWorkItems - Number of items being worked on by a SIMD.
461 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
462 {
463 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
464 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
465 return _simd_castps_si(vMask(mask));
466 }
467
468
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
template<size_t SimdWidth>
struct GatherScissors
{
    // Primary template: only explicit specializations for supported SIMD
    // widths are implemented; instantiating any other width asserts.
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};
489
//////////////////////////////////////////////////////////////////////////
/// @brief 8-wide specialization: for each of 8 primitives, loads the
///        component of the scissor rect selected by that primitive's
///        viewport index into the corresponding SIMD lane.
template<>
struct GatherScissors<8>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        // One _simd_set_epi32 per rect component; element i reads the
        // scissor rect indexed by pViewportIndex[i].
        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    }
};
531
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object supplying the assembled prims.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the SO shader reads vertex attribs from.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // soMask has one bit per attribute slot enabled for this stream.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative dword offset from start of vertex.
            // NOTE(review): an earlier comment said this should write at
            // (slot - 1) because PA attributes start at slot 1, but the code
            // offsets by `slot` directly (noted then as the GL convention) --
            // confirm which layout the SO shader expects.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot and move to the next set bit.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
620
621 //////////////////////////////////////////////////////////////////////////
622 /// @brief Computes number of invocations. The current index represents
623 /// the start of the SIMD. The max index represents how much work
624 /// items are remaining. If there is less then a SIMD's xmin of work
625 /// then return the remaining amount of work.
626 /// @param curIndex - The start index for the SIMD.
627 /// @param maxIndex - The last index for all work items.
628 static INLINE uint32_t GetNumInvocations(
629 uint32_t curIndex,
630 uint32_t maxIndex)
631 {
632 uint32_t remainder = (maxIndex - curIndex);
633 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
634 }
635
636 //////////////////////////////////////////////////////////////////////////
637 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
638 /// The geometry shader will loop over each active streamout buffer, assembling
639 /// primitives for the downstream stages. When multistream output is enabled,
640 /// the generated stream ID buffer from the GS needs to be converted to a cut
641 /// buffer for the primitive assembler.
642 /// @param stream - stream id to generate the cut buffer for
643 /// @param pStreamIdBase - pointer to the stream ID buffer
644 /// @param numEmittedVerts - Number of total verts emitted by the GS
645 /// @param pCutBuffer - output buffer to write cuts to
646 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
647 {
648 SWR_ASSERT(stream < MAX_SO_STREAMS);
649
650 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
651 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
652
653 for (uint32_t b = 0; b < numOutputBytes; ++b)
654 {
655 uint8_t curInputByte = pStreamIdBase[2*b];
656 uint8_t outByte = 0;
657 for (uint32_t i = 0; i < 4; ++i)
658 {
659 if ((curInputByte & 0x3) != stream)
660 {
661 outByte |= (1 << i);
662 }
663 curInputByte >>= 2;
664 }
665
666 curInputByte = pStreamIdBase[2 * b + 1];
667 for (uint32_t i = 0; i < 4; ++i)
668 {
669 if ((curInputByte & 0x3) != stream)
670 {
671 outByte |= (1 << (i + 4));
672 }
673 curInputByte >>= 2;
674 }
675
676 *pCutBuffer++ = outByte;
677 }
678 }
679
680 THREAD SWR_GS_CONTEXT tlsGsContext;
681
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage: assembles input-primitive attributes into
///        the thread-local GS context, invokes the GS jit once per GS
///        instance, then re-assembles the emitted verts per stream and
///        forwards them to streamout and/or the clipper/binner.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (or stream-ID) buffer written by the GS.
/// @param pStreamCutBuffer - scratch cut buffer for multi-stream translation.
/// @param pSoPrimData - scratch buffer for streamout prim data.
/// @param primID - per-lane primitive IDs of the input prims.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides through the GS output and cut buffers. Per input prim the GS
    // may emit up to maxNumVerts verts, stored in SIMD batches.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per emitted vert
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi-stream: 2 stream-ID bits per emitted vert, dword-aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // NOTE(review): tlsGsContext.vertexCount is overwritten by each GS
    // instance invocation above, yet pVertexCount[inputPrim] is read for
    // every instance below -- presumably each instance emits the same
    // count, or only the last instance's counts are intended; verify.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE(review): this local intentionally(?) shadows the
                // function parameter `pCutBuffer` -- consider renaming.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // otherwise replicate the input prim's ID
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
900
901 //////////////////////////////////////////////////////////////////////////
902 /// @brief Allocate GS buffers
903 /// @param pDC - pointer to draw context.
904 /// @param state - API state
905 /// @param ppGsOut - pointer to GS output buffer allocation
906 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
907 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
908 void **ppStreamCutBuffer)
909 {
910 auto pArena = pDC->pArena;
911 SWR_ASSERT(pArena != nullptr);
912 SWR_ASSERT(state.gsState.gsEnable);
913 // allocate arena space to hold GS output verts
914 // @todo pack attribs
915 // @todo support multiple streams
916 const uint32_t vertexStride = sizeof(simdvertex);
917 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
918 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
919 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
920
921 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
922 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
923 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
924 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
925
926 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
927 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
928
929 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
930 if (state.gsState.isSingleStream)
931 {
932 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
933 *ppStreamCutBuffer = nullptr;
934 }
935 else
936 {
937 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
938 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
939 }
940
941 }
942
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;                   // input/output context for the hull shader
    ScalarPatch patchData[KNOB_SIMD_WIDTH];     // HS control-point output, one patch per SIMD lane

    void* pTxCtx;                               // backing storage for the tessellator context
    size_t tsCtxSize;                           // size in bytes of pTxCtx (written by TSInitCtx)

    simdscalar* pDSOutput;                      // domain shader output buffer, grown on demand
    size_t numDSOutputVectors;                  // current capacity of pDSOutput in simd vectors
};
956
// Per-worker-thread tessellation scratch storage; lazily allocated on first use.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
958
959 //////////////////////////////////////////////////////////////////////////
960 /// @brief Allocate tessellation data for this worker thread.
961 INLINE
962 static void AllocateTessellationData(SWR_CONTEXT* pContext)
963 {
964 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
965 if (gt_pTessellationThreadData == nullptr)
966 {
967 gt_pTessellationThreadData = (TessellationThreadLocalData*)
968 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
969 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
970 }
971 }
972
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut buffer for GS output (stream-id buffer when multi-stream)
/// @param pCutStreamBuffer - temporary per-stream cut buffer (multi-stream GS only)
/// @param pSoPrimData - scratch space for stream-out primitive data
/// @param primID - per-lane primitive IDs for the input SIMD of patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // first call reported the required context size in tsCtxSize;
        // allocate backing memory and retry
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // select the binner matching the DS output topology when rasterizing
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison HS output so stale reads are obvious in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // tessellate, domain-shade, and bin each patch of the SIMD individually
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        if (tsData.NumPrimitives == 0)
        {
            // patch culled by its tess factors; nothing to shade or bin
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // grow the persistent per-thread DS output buffer
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes beyond the last domain point
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // assemble the tessellator-generated primitives from the DS output
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1166
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut (primitive restart) index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // per-lane offsets 0..7 used to synthesize sequential vertex indices
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // for indexed draws, compute the address one past the last index consumed
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // GS scratch buffers; arena-backed, released with the draw context
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;     // vertex counter for the current instance

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // non-indexed: synthesize sequential per-lane indices
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // route assembled prims through the enabled pipeline stages
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance to the next SIMD of vertices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1433
// Functor consumed by TemplateArgUnroller to instantiate the
// correctly-specialized templated ProcessDraw front-end function.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1444
1445
// Selector for correct templated Draw front-end function
// Maps the runtime pipeline flags to the matching ProcessDraw template
// instantiation via TemplateArgUnroller.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1457
1458 //////////////////////////////////////////////////////////////////////////
1459 /// @brief Processes attributes for the backend based on linkage mask and
1460 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1461 /// @param pDC - Draw context
1462 /// @param pa - Primitive Assembly state
1463 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1464 /// @param pLinkageMap - maps VS attribute slot to PS slot
1465 /// @param triIndex - Triangle to process attributes for
1466 /// @param pBuffer - Output result
1467 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1468 INLINE void ProcessAttributes(
1469 DRAW_CONTEXT *pDC,
1470 PA_STATE&pa,
1471 uint32_t triIndex,
1472 uint32_t primId,
1473 float *pBuffer)
1474 {
1475 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1476 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1477 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1478 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1479 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1480 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1481
1482 static const float constTable[3][4] = {
1483 {0.0f, 0.0f, 0.0f, 0.0f},
1484 {0.0f, 0.0f, 0.0f, 1.0f},
1485 {1.0f, 1.0f, 1.0f, 1.0f}
1486 };
1487
1488 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1489 {
1490 uint32_t inputSlot;
1491 if (IsSwizzledT::value)
1492 {
1493 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1494 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1495
1496 }
1497 else
1498 {
1499 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1500 }
1501
1502 __m128 attrib[3]; // triangle attribs (always 4 wide)
1503 float* pAttribStart = pBuffer;
1504
1505 if (HasConstantInterpT::value || IsDegenerate::value)
1506 {
1507 if (_bittest(&constantInterpMask, i))
1508 {
1509 uint32_t vid;
1510 uint32_t adjustedTriIndex;
1511 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1512 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1513 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1514 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1515 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1516
1517 switch (topo) {
1518 case TOP_QUAD_LIST:
1519 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1520 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1521 break;
1522 case TOP_QUAD_STRIP:
1523 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1524 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1525 break;
1526 case TOP_TRIANGLE_STRIP:
1527 adjustedTriIndex = triIndex;
1528 vid = (triIndex & 1)
1529 ? tristripProvokingVertex[provokingVertex]
1530 : provokingVertex;
1531 break;
1532 default:
1533 adjustedTriIndex = triIndex;
1534 vid = provokingVertex;
1535 break;
1536 }
1537
1538 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1539
1540 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1541 {
1542 _mm_store_ps(pBuffer, attrib[vid]);
1543 pBuffer += 4;
1544 }
1545 }
1546 else
1547 {
1548 pa.AssembleSingle(inputSlot, triIndex, attrib);
1549
1550 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1551 {
1552 _mm_store_ps(pBuffer, attrib[i]);
1553 pBuffer += 4;
1554 }
1555 }
1556 }
1557 else
1558 {
1559 pa.AssembleSingle(inputSlot, triIndex, attrib);
1560
1561 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1562 {
1563 _mm_store_ps(pBuffer, attrib[i]);
1564 pBuffer += 4;
1565 }
1566 }
1567
1568 // pad out the attrib buffer to 3 verts to ensure the triangle
1569 // interpolation code in the pixel shader works correctly for the
1570 // 3 topologies - point, line, tri. This effectively zeros out the
1571 // effect of the missing vertices in the triangle interpolation.
1572 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1573 {
1574 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1575 pBuffer += 4;
1576 }
1577
1578 // check for constant source overrides
1579 if (IsSwizzledT::value)
1580 {
1581 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1582 if (mask)
1583 {
1584 DWORD comp;
1585 while (_BitScanForward(&comp, mask))
1586 {
1587 mask &= ~(1 << comp);
1588
1589 float constantValue = 0.0f;
1590 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1591 {
1592 case SWR_CONSTANT_SOURCE_CONST_0000:
1593 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1594 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1595 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1596 break;
1597 case SWR_CONSTANT_SOURCE_PRIM_ID:
1598 constantValue = *(float*)&primId;
1599 break;
1600 }
1601
1602 // apply constant value to all 3 vertices
1603 for (uint32_t v = 0; v < 3; ++v)
1604 {
1605 pAttribStart[comp + v * 4] = constantValue;
1606 }
1607 }
1608 }
1609 }
1610 }
1611 }
1612
1613
// Function-pointer type of a specialized ProcessAttributes instantiation:
// (pDC, pa, triIndex, primId, pBuffer)
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

// Functor consumed by TemplateArgUnroller to instantiate the
// correctly-specialized ProcessAttributes function.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1626
// Selects the ProcessAttributes specialization matching the runtime flags.
// NumVerts must be in [1, 3] (enforced by IntArg<1, 3>).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1631
1632 //////////////////////////////////////////////////////////////////////////
1633 /// @brief Processes enabled user clip distances. Loads the active clip
1634 /// distances from the PA, sets up barycentric equations, and
1635 /// stores the results to the output buffer
1636 /// @param pa - Primitive Assembly state
1637 /// @param primIndex - primitive index to process
1638 /// @param clipDistMask - mask of enabled clip distances
1639 /// @param pUserClipBuffer - buffer to store results
1640 template<uint32_t NumVerts>
1641 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1642 {
1643 DWORD clipDist;
1644 while (_BitScanForward(&clipDist, clipDistMask))
1645 {
1646 clipDistMask &= ~(1 << clipDist);
1647 uint32_t clipSlot = clipDist >> 2;
1648 uint32_t clipComp = clipDist & 0x3;
1649 uint32_t clipAttribSlot = clipSlot == 0 ?
1650 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1651
1652 __m128 primClipDist[3];
1653 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1654
1655 float vertClipDist[NumVerts];
1656 for (uint32_t e = 0; e < NumVerts; ++e)
1657 {
1658 OSALIGNSIMD(float) aVertClipDist[4];
1659 _mm_store_ps(aVertClipDist, primClipDist[e]);
1660 vertClipDist[e] = aVertClipDist[clipComp];
1661 };
1662
1663 // setup plane equations for barycentric interpolation in the backend
1664 float baryCoeff[NumVerts];
1665 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1666 {
1667 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1668 }
1669 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1670
1671 for (uint32_t e = 0; e < NumVerts; ++e)
1672 {
1673 *(pUserClipBuffer++) = baryCoeff[e];
1674 }
1675 }
1676 }
1677
1678 //////////////////////////////////////////////////////////////////////////
1679 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1680 /// Point precision from FP32.
1681 template <typename PT = FixedPointTraits<Fixed_16_8>>
1682 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1683 {
1684 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1685 return _simd_cvtps_epi32(vFixed);
1686 }
1687
1688 //////////////////////////////////////////////////////////////////////////
1689 /// @brief Helper function to set the X,Y coords of a triangle to the
1690 /// requested Fixed Point precision from FP32.
1691 /// @param tri: simdvector[3] of FP triangle verts
1692 /// @param vXi: fixed point X coords of tri verts
1693 /// @param vYi: fixed point Y coords of tri verts
1694 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1695 {
1696 vXi[0] = fpToFixedPointVertical(tri[0].x);
1697 vYi[0] = fpToFixedPointVertical(tri[0].y);
1698 vXi[1] = fpToFixedPointVertical(tri[1].x);
1699 vYi[1] = fpToFixedPointVertical(tri[1].y);
1700 vXi[2] = fpToFixedPointVertical(tri[2].x);
1701 vYi[2] = fpToFixedPointVertical(tri[2].y);
1702 }
1703
1704 //////////////////////////////////////////////////////////////////////////
1705 /// @brief Calculate bounding box for current triangle
1706 /// @tparam CT: ConservativeRastFETraits type
1707 /// @param vX: fixed point X position for triangle verts
1708 /// @param vY: fixed point Y position for triangle verts
1709 /// @param bbox: fixed point bbox
1710 /// *Note*: expects vX, vY to be in the correct precision for the type
1711 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1712 template <typename CT>
1713 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1714 {
1715 simdscalari vMinX = vX[0];
1716 vMinX = _simd_min_epi32(vMinX, vX[1]);
1717 vMinX = _simd_min_epi32(vMinX, vX[2]);
1718
1719 simdscalari vMaxX = vX[0];
1720 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1721 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1722
1723 simdscalari vMinY = vY[0];
1724 vMinY = _simd_min_epi32(vMinY, vY[1]);
1725 vMinY = _simd_min_epi32(vMinY, vY[2]);
1726
1727 simdscalari vMaxY = vY[0];
1728 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1729 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1730
1731 bbox.xmin = vMinX;
1732 bbox.xmax = vMaxX;
1733 bbox.ymin = vMinY;
1734 bbox.ymax = vMaxY;
1735 }
1736
1737 //////////////////////////////////////////////////////////////////////////
1738 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1739 /// Offsets BBox for conservative rast
1740 template <>
1741 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1742 {
1743 // FE conservative rast traits
1744 typedef FEConservativeRastT CT;
1745
1746 simdscalari vMinX = vX[0];
1747 vMinX = _simd_min_epi32(vMinX, vX[1]);
1748 vMinX = _simd_min_epi32(vMinX, vX[2]);
1749
1750 simdscalari vMaxX = vX[0];
1751 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1752 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1753
1754 simdscalari vMinY = vY[0];
1755 vMinY = _simd_min_epi32(vMinY, vY[1]);
1756 vMinY = _simd_min_epi32(vMinY, vY[2]);
1757
1758 simdscalari vMaxY = vY[0];
1759 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1760 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1761
1762 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1763 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1764 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1765 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1766 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1767 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1768 }
1769
1770 //////////////////////////////////////////////////////////////////////////
1771 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1772 /// culling, viewport transform, etc.
1773 /// @param pDC - pointer to draw context.
1774 /// @param pa - The primitive assembly object.
1775 /// @param workerId - thread's worker id. Even thread has a unique id.
1776 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1777 /// @param primID - Primitive ID for each triangle.
1778 /// @param viewportIdx - viewport array index for each triangle.
1779 /// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    RDTSC_START(FEBinTriangles);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;


    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide: 1/w per vertex, then scale x/y/z by it
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            // per-lane viewport selection when the GS emits a VP array index
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup: edge A/B coefficients per triangle
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (signed area * 2); split across two registers of 64-bit lanes
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if(!CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if(CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if(cullZeroAreaMask > 0)
        {
            // e0 = v1-v0; an edge is degenerate when both endpoints coincide
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // scatter the per-lane edge bits into 3-bits-per-triangle positions via pdep
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // a set bit enables an edge; degenerate edges are masked off
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
                                    (rastState.scissorEnable > 0));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            // round min up / max down to the nearest pixel center (x.8 fixed point);
            // equal results mean the bbox spans no pixel center
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                                                scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if(CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft,   bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight,  bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop,    bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if(CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                                             (rastState.scissorEnable > 0));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing            = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID                 = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex          = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data (x[3], y[3], z[3], 1/w[3] as 4-float rows)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the triangle to every macrotile its bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    RDTSC_STOP(FEBinTriangles, 1, 0);
}
2173
2174 struct FEBinTrianglesChooser
2175 {
2176 typedef PFN_PROCESS_PRIMS FuncType;
2177
2178 template <typename... ArgsB>
2179 static FuncType GetFunc()
2180 {
2181 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2182 }
2183 };
2184
2185 // Selector for correct templated BinTrinagles function
2186 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2187 {
2188 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2189 }
2190
2191 //////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Only supports point size of 1
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport Array Index for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    RDTSC_START(FEBinPoints);

    // a point primitive has a single vertex
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    // fast path for 1-pixel points: each point fits in a single raster tile
    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (negative fixed-point coords have the sign bit set, which movemask extracts)
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing            = 1;
            desc.triFlags.primID                 = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            // NOTE: x/y are stored as raw uint32_t bits in the float buffer;
            // the rasterizer reinterprets them on the other side
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simdBBox bbox;
        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                                                    scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft,   bbox.xmin);
        _simd_store_si((simdscalari*)aMTRight,  bbox.xmax);
        _simd_store_si((simdscalari*)aMTTop,    bbox.ymin);
        _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            desc.triFlags.frontFacing            = 1;
            desc.triFlags.primID                 = pPrimID[primIndex];
            desc.triFlags.pointSize              = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the point to every macrotile its bloated bbox touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }




    RDTSC_STOP(FEBinPoints, 1, 0);
}
2527
2528 //////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
2536 void BinLines(
2537 DRAW_CONTEXT *pDC,
2538 PA_STATE& pa,
2539 uint32_t workerId,
2540 simdvector prim[],
2541 uint32_t primMask,
2542 simdscalari primID,
2543 simdscalari viewportIdx)
2544 {
2545 RDTSC_START(FEBinLines);
2546
2547 const API_STATE& state = GetApiState(pDC);
2548 const SWR_RASTSTATE& rastState = state.rastState;
2549 const SWR_FRONTEND_STATE& feState = state.frontendState;
2550 const SWR_GS_STATE& gsState = state.gsState;
2551
2552 // Select attribute processor
2553 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2554 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2555
2556 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2557 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2558
2559 if (!feState.vpTransformDisable)
2560 {
2561 // perspective divide
2562 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2563 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2564
2565 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2566 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2567
2568 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2569 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2570
2571 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2572 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2573
2574 // viewport transform to screen coords
2575 if (state.gsState.emitsViewportArrayIndex)
2576 {
2577 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2578 }
2579 else
2580 {
2581 viewportTransform<2>(prim, state.vpMatrices);
2582 }
2583 }
2584
2585 // adjust for pixel center location
2586 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2587 prim[0].x = _simd_add_ps(prim[0].x, offset);
2588 prim[0].y = _simd_add_ps(prim[0].y, offset);
2589
2590 prim[1].x = _simd_add_ps(prim[1].x, offset);
2591 prim[1].y = _simd_add_ps(prim[1].y, offset);
2592
2593 // convert to fixed point
2594 simdscalari vXi[2], vYi[2];
2595 vXi[0] = fpToFixedPointVertical(prim[0].x);
2596 vYi[0] = fpToFixedPointVertical(prim[0].y);
2597 vXi[1] = fpToFixedPointVertical(prim[1].x);
2598 vYi[1] = fpToFixedPointVertical(prim[1].y);
2599
2600 // compute x-major vs y-major mask
2601 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
// NOTE(review): this is the tail of the SIMD line-binning routine (BinLines);
// the prologue (vXi/vYi endpoint setup, xLength, primMask, vRecipW0/1,
// viewportIdx, rastState/gsState locals) is above this excerpt.
// A lane is "y-major" when |dy| > |dx|; movemask packs one sign bit per lane.
2602 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2603 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2604 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2605
2606 // cull zero-length lines
// Both |dx| == 0 AND |dy| == 0: degenerate line, drop its lane from primMask.
2607 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2608 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2609
2610 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2611
// Scalar views of per-lane primitive IDs / viewport indices for the bin loop below.
2612 uint32_t *pPrimID = (uint32_t *)&primID;
2613 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2614
// Zero filler passed as the (nonexistent) third vertex to the 3x8 transposes below.
2615 simdscalar vUnused = _simd_setzero_ps();
2616
2617 // Calc bounding box of lines
// Per-lane min/max of the two endpoints, in x.8 fixed-point screen space.
2618 simdBBox bbox;
2619 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2620 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2621 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2622 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2623
2624 // bloat bbox by line width along minor axis
// Expand by half the line width in fixed point; the blends below apply the
// bloat only along each lane's minor axis (x for y-major lanes, y for x-major).
2625 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2626 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2627 simdBBox bloatBox;
2628 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2629 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2630 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2631 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2632
// blendv selects the bloated extents where vYmajorMask is set:
// y-major lanes widen in x; x-major lanes (mask clear) widen in y.
2633 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2634 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2635 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2636 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2637
2638 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2639 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
// When the GS writes a viewport array index, gather each lane's scissor rect;
// otherwise broadcast scissor 0 to all lanes (common fast path).
2640 if (state.gsState.emitsViewportArrayIndex)
2641 {
2642 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2643 scisXmin, scisYmin, scisXmax, scisYmax);
2644 }
2645 else // broadcast fast path for non-VPAI case.
2646 {
2647 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2648 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2649 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2650 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2651 }
2652
2653 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2654 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2655 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2656 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2657
2658 // Cull prims completely outside scissor
// An inverted bbox (min > max on either axis) means no overlap; clear the lane.
2659 {
2660 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2661 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2662 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2663 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2664 primMask = primMask & ~maskOutsideScissor;
2665 }
2666
// Early out: every line was culled; skip straight to the timing epilogue.
2667 if (!primMask)
2668 {
2669 goto endBinLines;
2670 }
2671
2672 // Convert triangle bbox to macrotile units.
// Arithmetic shift turns fixed-point screen coords into macrotile indices.
2673 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2674 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2675 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2676 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2677
// Spill the per-lane macrotile extents to scalar arrays for the bin loop below.
2678 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2679 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2680 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2681 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2682 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2683
2684 // transpose verts needed for backend
2685 /// @todo modify BE to take non-transformed verts
// SoA -> AoS: one __m128 per primitive holding (v0, v1, unused) per component.
// vUnused zero-fills the third slot since a line has only two vertices.
2686 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2687 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2688 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2689 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2690 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2691
2692 // store render target array index
// Pulled from the GS-written RTAI vertex slot when enabled, else zeroed.
2693 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2694 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2695 {
2696 simdvector vRtai[2];
2697 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2698 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2699 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2700 }
2701 else
2702 {
2703 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2704 }
2705
2706 // scan remaining valid prims and bin each separately
2707 DWORD primIndex;
2708 while (_BitScanForward(&primIndex, primMask))
2709 {
2710 uint32_t linkageCount = state.backendState.numAttributes;
2711 uint32_t numScalarAttribs = linkageCount * 4;
2712
// Lines reuse the triangle work descriptor; the backend work function is
// RasterizeLine (set below) rather than the triangle rasterizer.
2713 BE_WORK work;
2714 work.type = DRAW;
2715
2716 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2717
2718 desc.triFlags.frontFacing = 1;
2719 desc.triFlags.primID = pPrimID[primIndex];
2720 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2721 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2722 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2723
2724 work.pfnWork = RasterizeLine;
2725
2726 auto pArena = pDC->pArena;
2727 SWR_ASSERT(pArena != nullptr);
2728
2729 // store active attribs
// NOTE(review): sized for 3 vertices of 4 floats per attribute — presumably
// the backend consumes a triangle-shaped attribute layout even for lines;
// confirm against pfnProcessAttribs / RasterizeLine.
2730 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2731 desc.numAttribs = linkageCount;
2732 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2733
2734 // store line vertex data
// 4x4 floats: x, y, z, w planes (each an __m128 lane-transpose of the verts).
2735 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2736 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2737 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2738 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2739 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2740
2741 // store user clip distances
// 2 values per enabled clip distance — one per line endpoint.
2742 if (rastState.clipDistanceMask)
2743 {
2744 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2745 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2746 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2747 }
2748
// Enqueue the work item into every macrotile the line's bbox touches
// (inclusive ranges; the max edge already had 1 ULP subtracted above).
2749 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2750 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2751 {
2752 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2753 {
2754 #if KNOB_ENABLE_TOSS_POINTS
// Perf-testing knob: optionally toss work instead of enqueueing it.
2755 if (!KNOB_TOSS_SETUP_TRIS)
2756 #endif
2757 {
2758 pTileMgr->enqueue(x, y, &work);
2759 }
2760 }
2761 }
2762
// Clear this lane and continue with the next surviving primitive.
2763 primMask &= ~(1 << primIndex);
2764 }
2765
2766 endBinLines:
2767
2768 RDTSC_STOP(FEBinLines, 1, 0);
2769 }