swr: [rasterizer core] remove all old stats code
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief Offsets added to post-viewport vertex positions based on
55 /// raster state.
// Lookup table indexed by SWR_PIXEL_LOCATION; the selected SIMD constant is
// added to post-viewport vertex positions (see header comment above).
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER: no adjustment needed
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL: presumably shifts UL-origin positions to pixel centers — TODO confirm
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 const API_STATE& state = GetApiState(pDC);
100
101 // queue a clear to each macro tile
102 // compute macro tile bounds for the current scissor/viewport
103 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
104 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
105 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
106 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
107
108 BE_WORK work;
109 work.type = CLEAR;
110 work.pfnWork = ProcessClearBE;
111 work.desc.clear = *pClear;
112
113 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
114 {
115 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
116 {
117 pTileMgr->enqueue(x, y, &work);
118 }
119 }
120 }
121
122 //////////////////////////////////////////////////////////////////////////
123 /// @brief FE handler for SwrStoreTiles.
124 /// @param pContext - pointer to SWR context.
125 /// @param pDC - pointer to draw context.
126 /// @param workerId - thread's worker id. Even thread has a unique id.
127 /// @param pUserData - Pointer to user data passed back to callback.
128 /// @todo This should go away when we switch this to use compute threading.
129 void ProcessStoreTiles(
130 SWR_CONTEXT *pContext,
131 DRAW_CONTEXT *pDC,
132 uint32_t workerId,
133 void *pUserData)
134 {
135 RDTSC_START(FEProcessStoreTiles);
136 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
137 MacroTileMgr *pTileMgr = pDC->pTileMgr;
138
139 const API_STATE& state = GetApiState(pDC);
140
141 // queue a store to each macro tile
142 // compute macro tile bounds for the current render target
143 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
144 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
145
146 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
147 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
148
149 // store tiles
150 BE_WORK work;
151 work.type = STORETILES;
152 work.pfnWork = ProcessStoreTileBE;
153 work.desc.storeTiles = *pStore;
154
155 for (uint32_t x = 0; x < numMacroTilesX; ++x)
156 {
157 for (uint32_t y = 0; y < numMacroTilesY; ++y)
158 {
159 pTileMgr->enqueue(x, y, &work);
160 }
161 }
162
163 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
164 }
165
166 //////////////////////////////////////////////////////////////////////////
167 /// @brief FE handler for SwrInvalidateTiles.
168 /// @param pContext - pointer to SWR context.
169 /// @param pDC - pointer to draw context.
170 /// @param workerId - thread's worker id. Even thread has a unique id.
171 /// @param pUserData - Pointer to user data passed back to callback.
172 /// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect in the descriptor means "no rect provided"; bitwise OR
    // of the four fields is a cheap nonzero test.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles: round the start edges UP and the end
    // edges DOWN so only tiles entirely inside the rect are included.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round start edges DOWN and end edges UP so
        // tiles only partially covered by the rect are included as well.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp defensively in release builds even though the asserts above
    // should hold; prevents enqueueing outside the hot-tile array.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // load tiles: broadcast one discard/invalidate work item per tile in range
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
245
246 //////////////////////////////////////////////////////////////////////////
247 /// @brief Computes the number of primitives given the number of verts.
248 /// @param mode - primitive topology for draw operation.
249 /// @param numPrims - number of vertices or indices for draw.
250 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    // NOTE: despite the parameter name, numPrims here is a vertex/index
    // count (see header comment); the return value is the primitive count.
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // Patch lists: patch size is encoded as the distance from the base enum,
    // so one primitive per (mode - TOP_PATCHLIST_BASE) control points.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    // Topologies not supported by this frontend.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
322
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Computes the number of verts given the number of primitives.
325 /// @param mode - primitive topology for draw operation.
326 /// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    // Inverse of GetNumPrims: given a primitive count, return how many
    // vertices/indices the topology consumes to produce them.
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // Patch lists: (mode - TOP_PATCHLIST_BASE) control points per patch.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    // Topologies not supported by this frontend.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    // Base vertex count for a single assembled primitive of this topology;
    // optionally widened below to include adjacency vertices.
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    // Patch lists: (topology - TOP_PATCHLIST_BASE) control points per patch.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_ASSERT(false, "Unsupported topology: %d", topology);
        break;
    }

    // Adjacency topologies carry extra neighbor vertices per primitive;
    // override the base count when the caller wants them included.
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
487
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Generate mask from remaining work.
490 /// @param numWorkItems - Number of items being worked on by a SIMD.
491 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
492 {
493 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
494 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
495 return _simd_castps_si(vMask(mask));
496 }
497
498 //////////////////////////////////////////////////////////////////////////
499 /// @brief StreamOut - Streams vertex data out to SO buffers.
500 /// Generally, we are only streaming out a SIMDs worth of triangles.
501 /// @param pDC - pointer to draw context.
502 /// @param workerId - thread's worker id. Even thread has a unique id.
503 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Vertices per streamed primitive (without adjacency verts).
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    // For each assembled primitive: gather its enabled attributes into
    // pPrimData, then invoke the jitted streamout function on that data.
    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot and continue with the next enabled one.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    // Accumulate streamout pipeline statistics for this stream.
    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
588
589 //////////////////////////////////////////////////////////////////////////
590 /// @brief Computes number of invocations. The current index represents
591 /// the start of the SIMD. The max index represents how much work
592 /// items are remaining. If there is less then a SIMD's left of work
593 /// then return the remaining amount of work.
594 /// @param curIndex - The start index for the SIMD.
595 /// @param maxIndex - The last index for all work items.
596 static INLINE uint32_t GetNumInvocations(
597 uint32_t curIndex,
598 uint32_t maxIndex)
599 {
600 uint32_t remainder = (maxIndex - curIndex);
601 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
602 }
603
604 //////////////////////////////////////////////////////////////////////////
605 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
606 /// The geometry shader will loop over each active streamout buffer, assembling
607 /// primitives for the downstream stages. When multistream output is enabled,
608 /// the generated stream ID buffer from the GS needs to be converted to a cut
609 /// buffer for the primitive assembler.
610 /// @param stream - stream id to generate the cut buffer for
611 /// @param pStreamIdBase - pointer to the stream ID buffer
612 /// @param numEmittedVerts - Number of total verts emitted by the GS
613 /// @param pCutBuffer - output buffer to write cuts to
614 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
615 {
616 SWR_ASSERT(stream < MAX_SO_STREAMS);
617
618 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
619 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
620
621 for (uint32_t b = 0; b < numOutputBytes; ++b)
622 {
623 uint8_t curInputByte = pStreamIdBase[2*b];
624 uint8_t outByte = 0;
625 for (uint32_t i = 0; i < 4; ++i)
626 {
627 if ((curInputByte & 0x3) != stream)
628 {
629 outByte |= (1 << i);
630 }
631 curInputByte >>= 2;
632 }
633
634 curInputByte = pStreamIdBase[2 * b + 1];
635 for (uint32_t i = 0; i < 4; ++i)
636 {
637 if ((curInputByte & 0x3) != stream)
638 {
639 outByte |= (1 << (i + 4));
640 }
641 curInputByte >>= 2;
642 }
643
644 *pCutBuffer++ = outByte;
645 }
646 }
647
648 THREAD SWR_GS_CONTEXT tlsGsContext;
649
650 //////////////////////////////////////////////////////////////////////////
651 /// @brief Implements GS stage.
652 /// @param pDC - pointer to draw context.
653 /// @param workerId - thread's worker id. Even thread has a unique id.
654 /// @param pa - The primitive assembly object.
655 /// @param pGsOut - output stream for GS
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    // Point the thread-local GS context at the caller-provided buffers.
    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides through the GS output stream: a batch of SIMD vertices per
    // input prim, replicated per GS instance.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // Single-stream GS writes 1 cut bit per vertex; multi-stream GS writes
    // 2 stream-ID bits per vertex (translated to cuts later).
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Execute the GS once per instance over the full SIMD of input prims.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: shadows the function parameter of the same name;
                // within this loop it refers to the per-stream cut data.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // Only the stream routed to the rasterizer is binned.
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
851
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief Allocate GS buffers
854 /// @param pDC - pointer to draw context.
855 /// @param state - API state
856 /// @param ppGsOut - pointer to GS output buffer allocation
857 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // Cut buffer: 1 bit per emitted vertex. StreamID buffer: 2 bits per
    // emitted vertex, rounded up to a dword multiple.
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        // Multi-stream: main buffer holds stream IDs; the extra buffer is
        // scratch for the per-stream cut translation (ProcessStreamIdBuffer).
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
893
894 //////////////////////////////////////////////////////////////////////////
895 /// @brief Contains all data generated by the HS and passed to the
896 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;                 // hull shader execution context
    ScalarPatch patchData[KNOB_SIMD_WIDTH];   // one patch per SIMD lane
    void* pTxCtx;                             // tessellator context storage (passed to TSInitCtx)
    size_t tsCtxSize;                         // size of pTxCtx allocation, reported by TSInitCtx

    simdscalar* pDSOutput;                    // presumably domain shader output buffer — used beyond this chunk, TODO confirm
    size_t numDSOutputVectors;                // capacity of pDSOutput — TODO confirm units
};

// Per-thread tessellation scratch data; allocated lazily by
// AllocateTessellationData on first use.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
909
910 //////////////////////////////////////////////////////////////////////////
911 /// @brief Allocate tessellation data for this worker thread.
912 INLINE
913 static void AllocateTessellationData(SWR_CONTEXT* pContext)
914 {
915 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
916 if (gt_pTessellationThreadData == nullptr)
917 {
918 gt_pTessellationThreadData = (TessellationThreadLocalData*)
919 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
920 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
921 }
922 }
923
924 //////////////////////////////////////////////////////////////////////////
925 /// @brief Implements Tessellation Stages.
926 /// @param pDC - pointer to draw context.
927 /// @param workerId - thread's worker id. Even thread has a unique id.
928 /// @param pa - The primitive assembly object.
929 /// @param pGsOut - output stream for GS
930 template <
931 typename HasGeometryShaderT,
932 typename HasStreamOutT,
933 typename HasRastT>
934 static void TessellationStages(
935 DRAW_CONTEXT *pDC,
936 uint32_t workerId,
937 PA_STATE& pa,
938 void* pGsOut,
939 void* pCutBuffer,
940 void* pCutStreamBuffer,
941 uint32_t* pSoPrimData,
942 simdscalari primID)
943 {
944 const API_STATE& state = GetApiState(pDC);
945 const SWR_TS_STATE& tsState = state.tsState;
946 SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
947
948 SWR_ASSERT(gt_pTessellationThreadData);
949
950 HANDLE tsCtx = TSInitCtx(
951 tsState.domain,
952 tsState.partitioning,
953 tsState.tsOutputTopology,
954 gt_pTessellationThreadData->pTxCtx,
955 gt_pTessellationThreadData->tsCtxSize);
956 if (tsCtx == nullptr)
957 {
958 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
959 tsCtx = TSInitCtx(
960 tsState.domain,
961 tsState.partitioning,
962 tsState.tsOutputTopology,
963 gt_pTessellationThreadData->pTxCtx,
964 gt_pTessellationThreadData->tsCtxSize);
965 }
966 SWR_ASSERT(tsCtx);
967
968 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
969 if (HasRastT::value)
970 {
971 switch (tsState.postDSTopology)
972 {
973 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
974 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
975 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
976 default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
977 }
978 }
979
980 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
981 hsContext.pCPout = gt_pTessellationThreadData->patchData;
982 hsContext.PrimitiveID = primID;
983
984 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
985 // Max storage for one attribute for an entire simdprimitive
986 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
987
988 // assemble all attributes for the input primitives
989 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
990 {
991 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
992 pa.Assemble(attribSlot, simdattrib);
993
994 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
995 {
996 hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
997 }
998 }
999
1000 #if defined(_DEBUG)
1001 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1002 #endif
1003
1004 uint32_t numPrims = pa.NumPrims();
1005 hsContext.mask = GenerateMask(numPrims);
1006
1007 // Run the HS
1008 RDTSC_START(FEHullShader);
1009 state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1010 RDTSC_STOP(FEHullShader, 0, 0);
1011
1012 UPDATE_STAT(HsInvocations, numPrims);
1013
1014 const uint32_t* pPrimId = (const uint32_t*)&primID;
1015
1016 for (uint32_t p = 0; p < numPrims; ++p)
1017 {
1018 // Run Tessellator
1019 SWR_TS_TESSELLATED_DATA tsData = { 0 };
1020 RDTSC_START(FETessellation);
1021 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1022 RDTSC_STOP(FETessellation, 0, 0);
1023
1024 if (tsData.NumPrimitives == 0)
1025 {
1026 continue;
1027 }
1028 SWR_ASSERT(tsData.NumDomainPoints);
1029
1030 // Allocate DS Output memory
1031 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1032 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1033 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1034 if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1035 {
1036 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1037 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1038 gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1039 }
1040 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1041 SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1042
1043 #if defined(_DEBUG)
1044 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1045 #endif
1046
1047 // Run Domain Shader
1048 SWR_DS_CONTEXT dsContext;
1049 dsContext.PrimitiveID = pPrimId[p];
1050 dsContext.pCpIn = &hsContext.pCPout[p];
1051 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1052 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1053 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1054 dsContext.vectorStride = requiredDSVectorInvocations;
1055
1056 uint32_t dsInvocations = 0;
1057
1058 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1059 {
1060 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1061
1062 RDTSC_START(FEDomainShader);
1063 state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1064 RDTSC_STOP(FEDomainShader, 0, 0);
1065
1066 dsInvocations += KNOB_SIMD_WIDTH;
1067 }
1068 UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
1069
1070 PA_TESS tessPa(
1071 pDC,
1072 dsContext.pOutputData,
1073 dsContext.vectorStride,
1074 tsState.numDsOutputAttribs,
1075 tsData.ppIndices,
1076 tsData.NumPrimitives,
1077 tsState.postDSTopology);
1078
1079 while (tessPa.HasWork())
1080 {
1081 if (HasGeometryShaderT::value)
1082 {
1083 GeometryShaderStage<HasStreamOutT, HasRastT>(
1084 pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1085 _simd_set1_epi32(dsContext.PrimitiveID));
1086 }
1087 else
1088 {
1089 if (HasStreamOutT::value)
1090 {
1091 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1092 }
1093
1094 if (HasRastT::value)
1095 {
1096 simdvector prim[3]; // Only deal with triangles, lines, or points
1097 RDTSC_START(FEPAAssemble);
1098 #if SWR_ENABLE_ASSERTS
1099 bool assemble =
1100 #endif
1101 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1102 RDTSC_STOP(FEPAAssemble, 1, 0);
1103 SWR_ASSERT(assemble);
1104
1105 SWR_ASSERT(pfnClipFunc);
1106 pfnClipFunc(pDC, tessPa, workerId, prim,
1107 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
1108 }
1109 }
1110
1111 tessPa.NextPrim();
1112
1113 } // while (tessPa.HasWork())
1114 } // for (uint32_t p = 0; p < numPrims; ++p)
1115
1116 TSDestroyCtx(tsCtx);
1117 }
1118
1119 //////////////////////////////////////////////////////////////////////////
1120 /// @brief FE handler for SwrDraw.
1121 /// @tparam IsIndexedT - Is indexed drawing enabled
1122 /// @tparam HasTessellationT - Is tessellation enabled
1123 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1124 /// @tparam HasStreamOutT - Is stream-out enabled
1125 /// @tparam HasRastT - Is rasterization enabled
1126 /// @param pContext - pointer to SWR context.
1127 /// @param pDC - pointer to draw context.
1128 /// @param workerId - thread's worker id.
1129 /// @param pUserData - Pointer to DRAW_WORK
1130 template <
1131 typename IsIndexedT,
1132 typename IsCutIndexEnabledT,
1133 typename HasTessellationT,
1134 typename HasGeometryShaderT,
1135 typename HasStreamOutT,
1136 typename HasRastT>
1137 void ProcessDraw(
1138 SWR_CONTEXT *pContext,
1139 DRAW_CONTEXT *pDC,
1140 uint32_t workerId,
1141 void *pUserData)
1142 {
1143
1144 #if KNOB_ENABLE_TOSS_POINTS
1145 if (KNOB_TOSS_QUEUE_FE)
1146 {
1147 return;
1148 }
1149 #endif
1150
1151 RDTSC_START(FEProcessDraw);
1152
1153 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1154 const API_STATE& state = GetApiState(pDC);
1155 __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1156 SWR_VS_CONTEXT vsContext;
1157 simdvertex vin;
1158
1159 int indexSize = 0;
1160 uint32_t endVertex = work.numVerts;
1161
1162 const int32_t* pLastRequestedIndex = nullptr;
1163 if (IsIndexedT::value)
1164 {
1165 switch (work.type)
1166 {
1167 case R32_UINT:
1168 indexSize = sizeof(uint32_t);
1169 pLastRequestedIndex = &(work.pIB[endVertex]);
1170 break;
1171 case R16_UINT:
1172 indexSize = sizeof(uint16_t);
1173 // nasty address offset to last index
1174 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1175 break;
1176 case R8_UINT:
1177 indexSize = sizeof(uint8_t);
1178 // nasty address offset to last index
1179 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1180 break;
1181 default:
1182 SWR_ASSERT(0);
1183 }
1184 }
1185 else
1186 {
1187 // No cuts, prune partial primitives.
1188 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1189 }
1190
1191 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1192 fetchInfo.pStreams = &state.vertexBuffers[0];
1193 fetchInfo.StartInstance = work.startInstance;
1194 fetchInfo.StartVertex = 0;
1195
1196 vsContext.pVin = &vin;
1197
1198 if (IsIndexedT::value)
1199 {
1200 fetchInfo.BaseVertex = work.baseVertex;
1201
1202 // if the entire index buffer isn't being consumed, set the last index
1203 // so that fetches < a SIMD wide will be masked off
1204 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1205 if (pLastRequestedIndex < fetchInfo.pLastIndex)
1206 {
1207 fetchInfo.pLastIndex = pLastRequestedIndex;
1208 }
1209 }
1210 else
1211 {
1212 fetchInfo.StartVertex = work.startVertex;
1213 }
1214
1215 #ifdef KNOB_ENABLE_RDTSC
1216 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1217 #endif
1218
1219 void* pGsOut = nullptr;
1220 void* pCutBuffer = nullptr;
1221 void* pStreamCutBuffer = nullptr;
1222 if (HasGeometryShaderT::value)
1223 {
1224 AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1225 }
1226
1227 if (HasTessellationT::value)
1228 {
1229 SWR_ASSERT(state.tsState.tsEnable == true);
1230 SWR_ASSERT(state.pfnHsFunc != nullptr);
1231 SWR_ASSERT(state.pfnDsFunc != nullptr);
1232
1233 AllocateTessellationData(pContext);
1234 }
1235 else
1236 {
1237 SWR_ASSERT(state.tsState.tsEnable == false);
1238 SWR_ASSERT(state.pfnHsFunc == nullptr);
1239 SWR_ASSERT(state.pfnDsFunc == nullptr);
1240 }
1241
1242 // allocate space for streamout input prim data
1243 uint32_t* pSoPrimData = nullptr;
1244 if (HasStreamOutT::value)
1245 {
1246 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1247 }
1248
1249 // choose primitive assembler
1250 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1251 PA_STATE& pa = paFactory.GetPA();
1252
1253 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1254 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1255 {
1256 simdscalari vIndex;
1257 uint32_t i = 0;
1258
1259 if (IsIndexedT::value)
1260 {
1261 fetchInfo.pIndices = work.pIB;
1262 }
1263 else
1264 {
1265 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1266 fetchInfo.pIndices = (const int32_t*)&vIndex;
1267 }
1268
1269 fetchInfo.CurInstance = instanceNum;
1270 vsContext.InstanceID = instanceNum;
1271
1272 while (pa.HasWork())
1273 {
1274 // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1275 // So we need to keep this outside of (i < endVertex) check.
1276 simdmask* pvCutIndices = nullptr;
1277 if (IsIndexedT::value)
1278 {
1279 pvCutIndices = &pa.GetNextVsIndices();
1280 }
1281
1282 simdvertex& vout = pa.GetNextVsOutput();
1283 vsContext.pVout = &vout;
1284
1285 if (i < endVertex)
1286 {
1287
1288 // 1. Execute FS/VS for a single SIMD.
1289 RDTSC_START(FEFetchShader);
1290 state.pfnFetchFunc(fetchInfo, vin);
1291 RDTSC_STOP(FEFetchShader, 0, 0);
1292
1293 // forward fetch generated vertex IDs to the vertex shader
1294 vsContext.VertexID = fetchInfo.VertexID;
1295
1296 // Setup active mask for vertex shader.
1297 vsContext.mask = GenerateMask(endVertex - i);
1298
1299 // forward cut mask to the PA
1300 if (IsIndexedT::value)
1301 {
1302 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1303 }
1304
1305 UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
1306
1307 #if KNOB_ENABLE_TOSS_POINTS
1308 if (!KNOB_TOSS_FETCH)
1309 #endif
1310 {
1311 RDTSC_START(FEVertexShader);
1312 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1313 RDTSC_STOP(FEVertexShader, 0, 0);
1314
1315 UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
1316 }
1317 }
1318
1319 // 2. Assemble primitives given the last two SIMD.
1320 do
1321 {
1322 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1323 // PaAssemble returns false if there is not enough verts to assemble.
1324 RDTSC_START(FEPAAssemble);
1325 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1326 RDTSC_STOP(FEPAAssemble, 1, 0);
1327
1328 #if KNOB_ENABLE_TOSS_POINTS
1329 if (!KNOB_TOSS_FETCH)
1330 #endif
1331 {
1332 #if KNOB_ENABLE_TOSS_POINTS
1333 if (!KNOB_TOSS_VS)
1334 #endif
1335 {
1336 if (assemble)
1337 {
1338 UPDATE_STAT(IaPrimitives, pa.NumPrims());
1339
1340 if (HasTessellationT::value)
1341 {
1342 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1343 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1344 }
1345 else if (HasGeometryShaderT::value)
1346 {
1347 GeometryShaderStage<HasStreamOutT, HasRastT>(
1348 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1349 }
1350 else
1351 {
1352 // If streamout is enabled then stream vertices out to memory.
1353 if (HasStreamOutT::value)
1354 {
1355 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1356 }
1357
1358 if (HasRastT::value)
1359 {
1360 SWR_ASSERT(pDC->pState->pfnProcessPrims);
1361 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1362 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
1363 }
1364 }
1365 }
1366 }
1367 }
1368 } while (pa.NextPrim());
1369
1370 i += KNOB_SIMD_WIDTH;
1371 if (IsIndexedT::value)
1372 {
1373 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1374 }
1375 else
1376 {
1377 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1378 }
1379 }
1380 pa.Reset();
1381 }
1382
1383 RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
1384 }
1385
//////////////////////////////////////////////////////////////////////////
/// @brief Adapter used by TemplateArgUnroller to map a set of boolean
///        template arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    // returns the ProcessDraw instantiation for the given template arg pack
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1396
1397
1398 // Selector for correct templated Draw front-end function
1399 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1400 bool IsIndexed,
1401 bool IsCutIndexEnabled,
1402 bool HasTessellation,
1403 bool HasGeometryShader,
1404 bool HasStreamOut,
1405 bool HasRasterization)
1406 {
1407 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1408 }
1409
1410 //////////////////////////////////////////////////////////////////////////
1411 /// @brief Processes attributes for the backend based on linkage mask and
1412 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1413 /// @param pDC - Draw context
1414 /// @param pa - Primitive Assembly state
1415 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1416 /// @param pLinkageMap - maps VS attribute slot to PS slot
1417 /// @param triIndex - Triangle to process attributes for
1418 /// @param pBuffer - Output result
1419 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1420 INLINE void ProcessAttributes(
1421 DRAW_CONTEXT *pDC,
1422 PA_STATE&pa,
1423 uint32_t triIndex,
1424 uint32_t primId,
1425 float *pBuffer)
1426 {
1427 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1428 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1429 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1430 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1431 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1432 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1433
1434 static const float constTable[3][4] = {
1435 {0.0f, 0.0f, 0.0f, 0.0f},
1436 {0.0f, 0.0f, 0.0f, 1.0f},
1437 {1.0f, 1.0f, 1.0f, 1.0f}
1438 };
1439
1440 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1441 {
1442 uint32_t inputSlot;
1443 if (IsSwizzledT::value)
1444 {
1445 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1446 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1447
1448 }
1449 else
1450 {
1451 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1452 }
1453
1454 __m128 attrib[3]; // triangle attribs (always 4 wide)
1455 float* pAttribStart = pBuffer;
1456
1457 if (HasConstantInterpT::value || IsDegenerate::value)
1458 {
1459 if (_bittest(&constantInterpMask, i))
1460 {
1461 uint32_t vid;
1462 uint32_t adjustedTriIndex;
1463 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1464 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1465 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1466 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1467 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1468
1469 switch (topo) {
1470 case TOP_QUAD_LIST:
1471 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1472 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1473 break;
1474 case TOP_QUAD_STRIP:
1475 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1476 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1477 break;
1478 case TOP_TRIANGLE_STRIP:
1479 adjustedTriIndex = triIndex;
1480 vid = (triIndex & 1)
1481 ? tristripProvokingVertex[provokingVertex]
1482 : provokingVertex;
1483 break;
1484 default:
1485 adjustedTriIndex = triIndex;
1486 vid = provokingVertex;
1487 break;
1488 }
1489
1490 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1491
1492 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1493 {
1494 _mm_store_ps(pBuffer, attrib[vid]);
1495 pBuffer += 4;
1496 }
1497 }
1498 else
1499 {
1500 pa.AssembleSingle(inputSlot, triIndex, attrib);
1501
1502 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1503 {
1504 _mm_store_ps(pBuffer, attrib[i]);
1505 pBuffer += 4;
1506 }
1507 }
1508 }
1509 else
1510 {
1511 pa.AssembleSingle(inputSlot, triIndex, attrib);
1512
1513 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1514 {
1515 _mm_store_ps(pBuffer, attrib[i]);
1516 pBuffer += 4;
1517 }
1518 }
1519
1520 // pad out the attrib buffer to 3 verts to ensure the triangle
1521 // interpolation code in the pixel shader works correctly for the
1522 // 3 topologies - point, line, tri. This effectively zeros out the
1523 // effect of the missing vertices in the triangle interpolation.
1524 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1525 {
1526 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1527 pBuffer += 4;
1528 }
1529
1530 // check for constant source overrides
1531 if (IsSwizzledT::value)
1532 {
1533 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1534 if (mask)
1535 {
1536 DWORD comp;
1537 while (_BitScanForward(&comp, mask))
1538 {
1539 mask &= ~(1 << comp);
1540
1541 float constantValue = 0.0f;
1542 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1543 {
1544 case SWR_CONSTANT_SOURCE_CONST_0000:
1545 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1546 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1547 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1548 break;
1549 case SWR_CONSTANT_SOURCE_PRIM_ID:
1550 constantValue = *(float*)&primId;
1551 break;
1552 }
1553
1554 // apply constant value to all 3 vertices
1555 for (uint32_t v = 0; v < 3; ++v)
1556 {
1557 pAttribStart[comp + v * 4] = constantValue;
1558 }
1559 }
1560 }
1561 }
1562 }
1563 }
1564
1565
// signature of a ProcessAttributes instantiation:
// (pDC, pa, triIndex, primId, pBuffer)
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

//////////////////////////////////////////////////////////////////////////
/// @brief Adapter used by TemplateArgUnroller to map template arguments
///        onto the matching ProcessAttributes instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1578
// Selects the ProcessAttributes instantiation for the given runtime
// parameters (NumVerts is constrained to 1..3 by IntArg).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1583
1584 //////////////////////////////////////////////////////////////////////////
1585 /// @brief Processes enabled user clip distances. Loads the active clip
1586 /// distances from the PA, sets up barycentric equations, and
1587 /// stores the results to the output buffer
1588 /// @param pa - Primitive Assembly state
1589 /// @param primIndex - primitive index to process
1590 /// @param clipDistMask - mask of enabled clip distances
1591 /// @param pUserClipBuffer - buffer to store results
1592 template<uint32_t NumVerts>
1593 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1594 {
1595 DWORD clipDist;
1596 while (_BitScanForward(&clipDist, clipDistMask))
1597 {
1598 clipDistMask &= ~(1 << clipDist);
1599 uint32_t clipSlot = clipDist >> 2;
1600 uint32_t clipComp = clipDist & 0x3;
1601 uint32_t clipAttribSlot = clipSlot == 0 ?
1602 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1603
1604 __m128 primClipDist[3];
1605 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1606
1607 float vertClipDist[NumVerts];
1608 for (uint32_t e = 0; e < NumVerts; ++e)
1609 {
1610 OSALIGNSIMD(float) aVertClipDist[4];
1611 _mm_store_ps(aVertClipDist, primClipDist[e]);
1612 vertClipDist[e] = aVertClipDist[clipComp];
1613 };
1614
1615 // setup plane equations for barycentric interpolation in the backend
1616 float baryCoeff[NumVerts];
1617 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1618 {
1619 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1620 }
1621 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1622
1623 for (uint32_t e = 0; e < NumVerts; ++e)
1624 {
1625 *(pUserClipBuffer++) = baryCoeff[e];
1626 }
1627 }
1628 }
1629
1630 //////////////////////////////////////////////////////////////////////////
1631 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1632 /// Point precision from FP32.
1633 template <typename PT = FixedPointTraits<Fixed_16_8>>
1634 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1635 {
1636 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1637 return _simd_cvtps_epi32(vFixed);
1638 }
1639
1640 //////////////////////////////////////////////////////////////////////////
1641 /// @brief Helper function to set the X,Y coords of a triangle to the
1642 /// requested Fixed Point precision from FP32.
1643 /// @param tri: simdvector[3] of FP triangle verts
1644 /// @param vXi: fixed point X coords of tri verts
1645 /// @param vYi: fixed point Y coords of tri verts
1646 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1647 {
1648 vXi[0] = fpToFixedPointVertical(tri[0].x);
1649 vYi[0] = fpToFixedPointVertical(tri[0].y);
1650 vXi[1] = fpToFixedPointVertical(tri[1].x);
1651 vYi[1] = fpToFixedPointVertical(tri[1].y);
1652 vXi[2] = fpToFixedPointVertical(tri[2].x);
1653 vYi[2] = fpToFixedPointVertical(tri[2].y);
1654 }
1655
1656 //////////////////////////////////////////////////////////////////////////
1657 /// @brief Calculate bounding box for current triangle
1658 /// @tparam CT: ConservativeRastFETraits type
1659 /// @param vX: fixed point X position for triangle verts
1660 /// @param vY: fixed point Y position for triangle verts
1661 /// @param bbox: fixed point bbox
1662 /// *Note*: expects vX, vY to be in the correct precision for the type
1663 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1664 template <typename CT>
1665 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1666 {
1667 simdscalari vMinX = vX[0];
1668 vMinX = _simd_min_epi32(vMinX, vX[1]);
1669 vMinX = _simd_min_epi32(vMinX, vX[2]);
1670
1671 simdscalari vMaxX = vX[0];
1672 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1673 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1674
1675 simdscalari vMinY = vY[0];
1676 vMinY = _simd_min_epi32(vMinY, vY[1]);
1677 vMinY = _simd_min_epi32(vMinY, vY[2]);
1678
1679 simdscalari vMaxY = vY[0];
1680 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1681 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1682
1683 bbox.left = vMinX;
1684 bbox.right = vMaxX;
1685 bbox.top = vMinY;
1686 bbox.bottom = vMaxY;
1687 }
1688
1689 //////////////////////////////////////////////////////////////////////////
1690 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1691 /// Offsets BBox for conservative rast
1692 template <>
1693 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1694 {
1695 // FE conservative rast traits
1696 typedef FEConservativeRastT CT;
1697
1698 simdscalari vMinX = vX[0];
1699 vMinX = _simd_min_epi32(vMinX, vX[1]);
1700 vMinX = _simd_min_epi32(vMinX, vX[2]);
1701
1702 simdscalari vMaxX = vX[0];
1703 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1704 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1705
1706 simdscalari vMinY = vY[0];
1707 vMinY = _simd_min_epi32(vMinY, vY[1]);
1708 vMinY = _simd_min_epi32(vMinY, vY[2]);
1709
1710 simdscalari vMaxY = vY[0];
1711 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1712 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1713
1714 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1715 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1716 bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1717 bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1718 bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1719 bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1720 }
1721
1722 //////////////////////////////////////////////////////////////////////////
1723 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1724 /// culling, viewport transform, etc.
1725 /// @param pDC - pointer to draw context.
1726 /// @param pa - The primitive assembly object.
1727 /// @param workerId - thread's worker id. Even thread has a unique id.
1728 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1729 /// @param primID - Primitive ID for each triangle.
1730 /// @tparam CT - ConservativeRastFETraits
1731 template <typename CT>
1732 void BinTriangles(
1733 DRAW_CONTEXT *pDC,
1734 PA_STATE& pa,
1735 uint32_t workerId,
1736 simdvector tri[3],
1737 uint32_t triMask,
1738 simdscalari primID)
1739 {
1740 RDTSC_START(FEBinTriangles);
1741
1742 const API_STATE& state = GetApiState(pDC);
1743 const SWR_RASTSTATE& rastState = state.rastState;
1744 const SWR_FRONTEND_STATE& feState = state.frontendState;
1745 const SWR_GS_STATE& gsState = state.gsState;
1746 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1747
1748
1749 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1750 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1751 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1752
1753 if (!feState.vpTransformDisable)
1754 {
1755 // perspective divide
1756 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1757 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1758 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1759
1760 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1761 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1762 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1763
1764 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1765 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1766 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1767
1768 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1769 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1770 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1771
1772 // viewport transform to screen coords
1773 viewportTransform<3>(tri, state.vpMatrices);
1774 }
1775
1776 // adjust for pixel center location
1777 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1778 tri[0].x = _simd_add_ps(tri[0].x, offset);
1779 tri[0].y = _simd_add_ps(tri[0].y, offset);
1780
1781 tri[1].x = _simd_add_ps(tri[1].x, offset);
1782 tri[1].y = _simd_add_ps(tri[1].y, offset);
1783
1784 tri[2].x = _simd_add_ps(tri[2].x, offset);
1785 tri[2].y = _simd_add_ps(tri[2].y, offset);
1786
1787 simdscalari vXi[3], vYi[3];
1788 // Set vXi, vYi to required fixed point precision
1789 FPToFixedPoint(tri, vXi, vYi);
1790
1791 // triangle setup
1792 simdscalari vAi[3], vBi[3];
1793 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1794
1795 // determinant
1796 simdscalari vDet[2];
1797 calcDeterminantIntVertical(vAi, vBi, vDet);
1798
1799 // cull zero area
1800 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1801 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1802
1803 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1804
1805 uint32_t origTriMask = triMask;
1806 // don't cull degenerate triangles if we're conservatively rasterizing
1807 if(!CT::IsConservativeT::value)
1808 {
1809 triMask &= ~cullZeroAreaMask;
1810 }
1811
1812 // determine front winding tris
1813 // CW +det
1814 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1815 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1816 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1817 int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1818
1819 uint32_t frontWindingTris;
1820 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1821 {
1822 frontWindingTris = cwTriMask;
1823 }
1824 else
1825 {
1826 frontWindingTris = ~cwTriMask;
1827 }
1828
1829 // cull
1830 uint32_t cullTris;
1831 switch ((SWR_CULLMODE)rastState.cullMode)
1832 {
1833 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1834 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1835 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1836 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1837 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1838 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1839 }
1840
1841 triMask &= ~cullTris;
1842
1843 if (origTriMask ^ triMask)
1844 {
1845 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1846 }
1847
1848 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1849 // compute per tri backface
1850 uint32_t frontFaceMask = frontWindingTris;
1851 uint32_t *pPrimID = (uint32_t *)&primID;
1852 DWORD triIndex = 0;
1853 // for center sample pattern, all samples are at pixel center; calculate coverage
1854 // once at center and broadcast the results in the backend
1855 const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1856 uint32_t edgeEnable;
1857 PFN_WORK_FUNC pfnWork;
1858 if(CT::IsConservativeT::value)
1859 {
1860 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1861 // used to call the appropriate templated rasterizer function
1862 if(cullZeroAreaMask > 0)
1863 {
1864 // e0 = v1-v0
1865 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
1866 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
1867 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
1868
1869 // e1 = v2-v1
1870 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
1871 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
1872 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
1873
1874 // e2 = v0-v2
1875 // if v0 == v1 & v1 == v2, v0 == v2
1876 uint32_t e2Mask = e0Mask & e1Mask;
1877 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1878
1879 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1880 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1881 e0Mask = pdep_u32(e0Mask, 0x00249249);
1882 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1883 e1Mask = pdep_u32(e1Mask, 0x00492492);
1884 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1885 e2Mask = pdep_u32(e2Mask, 0x00924924);
1886
1887 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1888 }
1889 else
1890 {
1891 edgeEnable = 0x00FFFFFF;
1892 }
1893 }
1894 else
1895 {
1896 // degenerate triangles won't be sent to rasterizer; just enable all edges
1897 pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1898 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
1899 (rastState.scissorEnable > 0));
1900 }
1901
1902 if (!triMask)
1903 {
1904 goto endBinTriangles;
1905 }
1906
1907 // Calc bounding box of triangles
1908 simdBBox bbox;
1909 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1910
1911 // determine if triangle falls between pixel centers and discard
1912 // only discard for non-MSAA case and when conservative rast is disabled
1913 // (left + 127) & ~255
1914 // (right + 128) & ~255
1915 if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1916 {
1917 origTriMask = triMask;
1918
1919 int cullCenterMask;
1920 {
1921 simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1922 left = _simd_and_si(left, _simd_set1_epi32(~255));
1923 simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1924 right = _simd_and_si(right, _simd_set1_epi32(~255));
1925
1926 simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1927
1928 simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1929 top = _simd_and_si(top, _simd_set1_epi32(~255));
1930 simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1931 bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1932
1933 simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1934 vMaskV = _simd_or_si(vMaskH, vMaskV);
1935 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1936 }
1937
1938 triMask &= ~cullCenterMask;
1939
1940 if(origTriMask ^ triMask)
1941 {
1942 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1943 }
1944 }
1945
1946 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1947 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1948 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1949 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1950 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1951
1952 if(CT::IsConservativeT::value)
1953 {
1954 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1955 // some area. Bump the right/bottom edges out
1956 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
1957 bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
1958 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
1959 bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
1960 }
1961
1962 // Cull tris completely outside scissor
1963 {
1964 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1965 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1966 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1967 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1968 triMask = triMask & ~maskOutsideScissor;
1969 }
1970
1971 if (!triMask)
1972 {
1973 goto endBinTriangles;
1974 }
1975
1976 // Convert triangle bbox to macrotile units.
1977 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1978 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1979 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1980 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1981
1982 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1983 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
1984 _simd_store_si((simdscalari*)aMTRight, bbox.right);
1985 _simd_store_si((simdscalari*)aMTTop, bbox.top);
1986 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
1987
1988 // transpose verts needed for backend
1989 /// @todo modify BE to take non-transformed verts
1990 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
1991 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1992 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1993 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1994 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1995
1996 // store render target array index
1997 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1998 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1999 {
2000 simdvector vRtai[3];
2001 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2002 simdscalari vRtaii;
2003 vRtaii = _simd_castps_si(vRtai[0].x);
2004 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2005 }
2006 else
2007 {
2008 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2009 }
2010
2011 // scan remaining valid triangles and bin each separately
2012 while (_BitScanForward(&triIndex, triMask))
2013 {
2014 uint32_t linkageCount = state.backendState.numAttributes;
2015 uint32_t numScalarAttribs = linkageCount * 4;
2016
2017 BE_WORK work;
2018 work.type = DRAW;
2019
2020 bool isDegenerate;
2021 if(CT::IsConservativeT::value)
2022 {
2023 // only rasterize valid edges if we have a degenerate primitive
2024 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
2025 work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
2026 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
2027 (rastState.scissorEnable > 0));
2028
2029 // Degenerate triangles are required to be constant interpolated
2030 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
2031 }
2032 else
2033 {
2034 isDegenerate = false;
2035 work.pfnWork = pfnWork;
2036 }
2037
2038 // Select attribute processor
2039 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
2040 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
2041
2042 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2043
2044 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
2045 desc.triFlags.primID = pPrimID[triIndex];
2046 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
2047
2048 auto pArena = pDC->pArena;
2049 SWR_ASSERT(pArena != nullptr);
2050
2051 // store active attribs
2052 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2053 desc.pAttribs = pAttribs;
2054 desc.numAttribs = linkageCount;
2055 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
2056
2057 // store triangle vertex data
2058 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2059
2060 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
2061 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
2062 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
2063 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
2064
2065 // store user clip distances
2066 if (rastState.clipDistanceMask)
2067 {
2068 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2069 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2070 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2071 }
2072
2073 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
2074 {
2075 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
2076 {
2077 #if KNOB_ENABLE_TOSS_POINTS
2078 if (!KNOB_TOSS_SETUP_TRIS)
2079 #endif
2080 {
2081 pTileMgr->enqueue(x, y, &work);
2082 }
2083 }
2084 }
2085 triMask &= ~(1 << triIndex);
2086 }
2087
2088 endBinTriangles:
2089 RDTSC_STOP(FEBinTriangles, 1, 0);
2090 }
2091
2092 struct FEBinTrianglesChooser
2093 {
2094 typedef PFN_PROCESS_PRIMS FuncType;
2095
2096 template <typename... ArgsB>
2097 static FuncType GetFunc()
2098 {
2099 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2100 }
2101 };
2102
2103 // Selector for correct templated BinTrinagles function
2104 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2105 {
2106 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2107 }
2108
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Handles both simple 1-pixel
///        points and sized points, which may span multiple macrotiles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // only the first vertex of the assembled prim carries point data
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    // Select attribute processor (1 vert per prim for points)
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrices);
    }

    // adjust for pixel center location (upper-left vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // Fast path: 1-pixel points. Each point lands in exactly one
        // macrotile and is rasterized with RasterizeSimplePoint.

        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport; after the -1
        // adjust such points have negative coords, so movemask picks up
        // their sign bits
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index (from GS if it emits one, else 0)
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            // point work is submitted through the triangle work descriptor
            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            // retire this point from the mask
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start from the point position and expand by
        // half the point size on each side
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor (empty bbox after clamp)
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index (from GS if it emits one, else 0)
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            // NOTE(review): array sized 2 presumably to satisfy Assemble's
            // output contract; only element 0 is consumed here — confirm
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        // spill transformed positions so the scalar loop can index per-lane
        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data (x, y, z)
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the work item into every macrotile the bbox touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            // retire this point from the mask
            primMask &= ~(1 << primIndex);
        }
    }

    RDTSC_STOP(FEBinPoints, 1, 0);
}
2418
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // Select attribute processor (2 verts per prim for lines)
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    // default 1/w of 1.0 covers the vpTransformDisable case below
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide for both line endpoints
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrices);
    }

    // adjust for pixel center location (upper-left vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert to fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask (y-major when |dy| > |dx|)
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (both endpoints at the same fixed-point coord)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    // declared up front: the goto below must not jump over initializations
    uint32_t *pPrimID = (uint32_t *)&primID;

    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // y-major lanes take the bloated left/right (widen x); x-major lanes
    // take the bloated top/bottom (widen y)
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor (empty bbox after clamp)
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        goto endBinLines;
    }

    // Convert line bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend; third "vertex" slot is unused for lines
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index (from GS if it emits one, else 0)
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        // NOTE(review): only element 0 is consumed here
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        // line work is submitted through the triangle work descriptor
        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

        // store line vertex data (transposed x, y, z, 1/w)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the work item into every macrotile the bbox touches
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        // retire this line from the mask
        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}