swr: [rasterizer core] routing of viewport indexes through frontend
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
/// raster state. Indexed by the SWR_PIXEL_LOCATION_* enum value.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER: no offset applied
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL: half-pixel offset
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 const API_STATE& state = GetApiState(pDC);
100
101 // queue a clear to each macro tile
102 // compute macro tile bounds for the current scissor/viewport
103 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
104 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
105 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
106 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
107
108 BE_WORK work;
109 work.type = CLEAR;
110 work.pfnWork = ProcessClearBE;
111 work.desc.clear = *pClear;
112
113 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
114 {
115 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
116 {
117 pTileMgr->enqueue(x, y, &work);
118 }
119 }
120 }
121
122 //////////////////////////////////////////////////////////////////////////
123 /// @brief FE handler for SwrStoreTiles.
124 /// @param pContext - pointer to SWR context.
125 /// @param pDC - pointer to draw context.
126 /// @param workerId - thread's worker id. Even thread has a unique id.
127 /// @param pUserData - Pointer to user data passed back to callback.
128 /// @todo This should go away when we switch this to use compute threading.
129 void ProcessStoreTiles(
130 SWR_CONTEXT *pContext,
131 DRAW_CONTEXT *pDC,
132 uint32_t workerId,
133 void *pUserData)
134 {
135 RDTSC_START(FEProcessStoreTiles);
136 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
137 MacroTileMgr *pTileMgr = pDC->pTileMgr;
138
139 const API_STATE& state = GetApiState(pDC);
140
141 // queue a store to each macro tile
142 // compute macro tile bounds for the current render target
143 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
144 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
145
146 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
147 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
148
149 // store tiles
150 BE_WORK work;
151 work.type = STORETILES;
152 work.pfnWork = ProcessStoreTileBE;
153 work.desc.storeTiles = *pStore;
154
155 for (uint32_t x = 0; x < numMacroTilesX; ++x)
156 {
157 for (uint32_t y = 0; y < numMacroTilesY; ++y)
158 {
159 pTileMgr->enqueue(x, y, &work);
160 }
161 }
162
163 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
164 }
165
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - DISCARD_INVALIDATE_TILES_DESC; its rect selects the
///        region (all-zero rect means "use viewport 0").
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // Any nonzero edge means the caller supplied an explicit region; a rect
    // of all zeros falls back to viewport 0's dimensions.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles: start rounds up and end rounds down,
    // so partially covered edge tiles are excluded.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: start rounds down, end rounds up instead.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp in release builds too, so an oversized rect cannot enqueue tiles
    // past the hot-tile grid.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
245
246 //////////////////////////////////////////////////////////////////////////
247 /// @brief Computes the number of primitives given the number of verts.
248 /// @param mode - primitive topology for draw operation.
249 /// @param numPrims - number of vertices or indices for draw.
250 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
251 uint32_t GetNumPrims(
252 PRIMITIVE_TOPOLOGY mode,
253 uint32_t numPrims)
254 {
255 switch (mode)
256 {
257 case TOP_POINT_LIST: return numPrims;
258 case TOP_TRIANGLE_LIST: return numPrims / 3;
259 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
260 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
261 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
262 case TOP_QUAD_LIST: return numPrims / 4;
263 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
264 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
265 case TOP_LINE_LIST: return numPrims / 2;
266 case TOP_LINE_LOOP: return numPrims;
267 case TOP_RECT_LIST: return numPrims / 3;
268 case TOP_LINE_LIST_ADJ: return numPrims / 4;
269 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
270 case TOP_TRI_LIST_ADJ: return numPrims / 6;
271 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
272
273 case TOP_PATCHLIST_1:
274 case TOP_PATCHLIST_2:
275 case TOP_PATCHLIST_3:
276 case TOP_PATCHLIST_4:
277 case TOP_PATCHLIST_5:
278 case TOP_PATCHLIST_6:
279 case TOP_PATCHLIST_7:
280 case TOP_PATCHLIST_8:
281 case TOP_PATCHLIST_9:
282 case TOP_PATCHLIST_10:
283 case TOP_PATCHLIST_11:
284 case TOP_PATCHLIST_12:
285 case TOP_PATCHLIST_13:
286 case TOP_PATCHLIST_14:
287 case TOP_PATCHLIST_15:
288 case TOP_PATCHLIST_16:
289 case TOP_PATCHLIST_17:
290 case TOP_PATCHLIST_18:
291 case TOP_PATCHLIST_19:
292 case TOP_PATCHLIST_20:
293 case TOP_PATCHLIST_21:
294 case TOP_PATCHLIST_22:
295 case TOP_PATCHLIST_23:
296 case TOP_PATCHLIST_24:
297 case TOP_PATCHLIST_25:
298 case TOP_PATCHLIST_26:
299 case TOP_PATCHLIST_27:
300 case TOP_PATCHLIST_28:
301 case TOP_PATCHLIST_29:
302 case TOP_PATCHLIST_30:
303 case TOP_PATCHLIST_31:
304 case TOP_PATCHLIST_32:
305 return numPrims / (mode - TOP_PATCHLIST_BASE);
306
307 case TOP_POLYGON:
308 case TOP_POINT_LIST_BF:
309 case TOP_LINE_STRIP_CONT:
310 case TOP_LINE_STRIP_BF:
311 case TOP_LINE_STRIP_CONT_BF:
312 case TOP_TRIANGLE_FAN_NOSTIPPLE:
313 case TOP_TRI_STRIP_REVERSE:
314 case TOP_PATCHLIST_BASE:
315 case TOP_UNKNOWN:
316 SWR_ASSERT(false, "Unsupported topology: %d", mode);
317 return 0;
318 }
319
320 return 0;
321 }
322
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Computes the number of verts given the number of primitives.
325 /// @param mode - primitive topology for draw operation.
326 /// @param numPrims - number of primitives for draw.
327 uint32_t GetNumVerts(
328 PRIMITIVE_TOPOLOGY mode,
329 uint32_t numPrims)
330 {
331 switch (mode)
332 {
333 case TOP_POINT_LIST: return numPrims;
334 case TOP_TRIANGLE_LIST: return numPrims * 3;
335 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
336 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
337 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
338 case TOP_QUAD_LIST: return numPrims * 4;
339 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
340 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
341 case TOP_LINE_LIST: return numPrims * 2;
342 case TOP_LINE_LOOP: return numPrims;
343 case TOP_RECT_LIST: return numPrims * 3;
344 case TOP_LINE_LIST_ADJ: return numPrims * 4;
345 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
346 case TOP_TRI_LIST_ADJ: return numPrims * 6;
347 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
348
349 case TOP_PATCHLIST_1:
350 case TOP_PATCHLIST_2:
351 case TOP_PATCHLIST_3:
352 case TOP_PATCHLIST_4:
353 case TOP_PATCHLIST_5:
354 case TOP_PATCHLIST_6:
355 case TOP_PATCHLIST_7:
356 case TOP_PATCHLIST_8:
357 case TOP_PATCHLIST_9:
358 case TOP_PATCHLIST_10:
359 case TOP_PATCHLIST_11:
360 case TOP_PATCHLIST_12:
361 case TOP_PATCHLIST_13:
362 case TOP_PATCHLIST_14:
363 case TOP_PATCHLIST_15:
364 case TOP_PATCHLIST_16:
365 case TOP_PATCHLIST_17:
366 case TOP_PATCHLIST_18:
367 case TOP_PATCHLIST_19:
368 case TOP_PATCHLIST_20:
369 case TOP_PATCHLIST_21:
370 case TOP_PATCHLIST_22:
371 case TOP_PATCHLIST_23:
372 case TOP_PATCHLIST_24:
373 case TOP_PATCHLIST_25:
374 case TOP_PATCHLIST_26:
375 case TOP_PATCHLIST_27:
376 case TOP_PATCHLIST_28:
377 case TOP_PATCHLIST_29:
378 case TOP_PATCHLIST_30:
379 case TOP_PATCHLIST_31:
380 case TOP_PATCHLIST_32:
381 return numPrims * (mode - TOP_PATCHLIST_BASE);
382
383 case TOP_POLYGON:
384 case TOP_POINT_LIST_BF:
385 case TOP_LINE_STRIP_CONT:
386 case TOP_LINE_STRIP_BF:
387 case TOP_LINE_STRIP_CONT_BF:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE:
389 case TOP_TRI_STRIP_REVERSE:
390 case TOP_PATCHLIST_BASE:
391 case TOP_UNKNOWN:
392 SWR_ASSERT(false, "Unsupported topology: %d", mode);
393 return 0;
394 }
395
396 return 0;
397 }
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
404 {
405 uint32_t numVerts = 0;
406 switch (topology)
407 {
408 case TOP_POINT_LIST:
409 case TOP_POINT_LIST_BF:
410 numVerts = 1;
411 break;
412 case TOP_LINE_LIST:
413 case TOP_LINE_STRIP:
414 case TOP_LINE_LIST_ADJ:
415 case TOP_LINE_LOOP:
416 case TOP_LINE_STRIP_CONT:
417 case TOP_LINE_STRIP_BF:
418 case TOP_LISTSTRIP_ADJ:
419 numVerts = 2;
420 break;
421 case TOP_TRIANGLE_LIST:
422 case TOP_TRIANGLE_STRIP:
423 case TOP_TRIANGLE_FAN:
424 case TOP_TRI_LIST_ADJ:
425 case TOP_TRI_STRIP_ADJ:
426 case TOP_TRI_STRIP_REVERSE:
427 case TOP_RECT_LIST:
428 numVerts = 3;
429 break;
430 case TOP_QUAD_LIST:
431 case TOP_QUAD_STRIP:
432 numVerts = 4;
433 break;
434 case TOP_PATCHLIST_1:
435 case TOP_PATCHLIST_2:
436 case TOP_PATCHLIST_3:
437 case TOP_PATCHLIST_4:
438 case TOP_PATCHLIST_5:
439 case TOP_PATCHLIST_6:
440 case TOP_PATCHLIST_7:
441 case TOP_PATCHLIST_8:
442 case TOP_PATCHLIST_9:
443 case TOP_PATCHLIST_10:
444 case TOP_PATCHLIST_11:
445 case TOP_PATCHLIST_12:
446 case TOP_PATCHLIST_13:
447 case TOP_PATCHLIST_14:
448 case TOP_PATCHLIST_15:
449 case TOP_PATCHLIST_16:
450 case TOP_PATCHLIST_17:
451 case TOP_PATCHLIST_18:
452 case TOP_PATCHLIST_19:
453 case TOP_PATCHLIST_20:
454 case TOP_PATCHLIST_21:
455 case TOP_PATCHLIST_22:
456 case TOP_PATCHLIST_23:
457 case TOP_PATCHLIST_24:
458 case TOP_PATCHLIST_25:
459 case TOP_PATCHLIST_26:
460 case TOP_PATCHLIST_27:
461 case TOP_PATCHLIST_28:
462 case TOP_PATCHLIST_29:
463 case TOP_PATCHLIST_30:
464 case TOP_PATCHLIST_31:
465 case TOP_PATCHLIST_32:
466 numVerts = topology - TOP_PATCHLIST_BASE;
467 break;
468 default:
469 SWR_ASSERT(false, "Unsupported topology: %d", topology);
470 break;
471 }
472
473 if (includeAdjVerts)
474 {
475 switch (topology)
476 {
477 case TOP_LISTSTRIP_ADJ:
478 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
479 case TOP_TRI_STRIP_ADJ:
480 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
481 default: break;
482 }
483 }
484
485 return numVerts;
486 }
487
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Generate mask from remaining work.
490 /// @param numWorkItems - Number of items being worked on by a SIMD.
491 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
492 {
493 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
494 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
495 return _simd_castps_si(vMask(mask));
496 }
497
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object supplying the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer where assembled attributes are staged
///        before the streamout jit function consumes them.
/// @param streamIndex - which streamout stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;  // needed by UPDATE_STAT_FE below

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Each set bit in soMask names an attribute slot to stream out.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): an older scheme wrote attribs at (slot - 1); the
            // code below uses slot directly, so the pPrimData layout matches
            // the mask's slot numbering — confirm against the SOS jit.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    // update streamout pipeline stats for this stream
    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
588
589 //////////////////////////////////////////////////////////////////////////
590 /// @brief Computes number of invocations. The current index represents
591 /// the start of the SIMD. The max index represents how much work
592 /// items are remaining. If there is less then a SIMD's left of work
593 /// then return the remaining amount of work.
594 /// @param curIndex - The start index for the SIMD.
595 /// @param maxIndex - The last index for all work items.
596 static INLINE uint32_t GetNumInvocations(
597 uint32_t curIndex,
598 uint32_t maxIndex)
599 {
600 uint32_t remainder = (maxIndex - curIndex);
601 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
602 }
603
604 //////////////////////////////////////////////////////////////////////////
605 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
606 /// The geometry shader will loop over each active streamout buffer, assembling
607 /// primitives for the downstream stages. When multistream output is enabled,
608 /// the generated stream ID buffer from the GS needs to be converted to a cut
609 /// buffer for the primitive assembler.
610 /// @param stream - stream id to generate the cut buffer for
611 /// @param pStreamIdBase - pointer to the stream ID buffer
612 /// @param numEmittedVerts - Number of total verts emitted by the GS
613 /// @param pCutBuffer - output buffer to write cuts to
614 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
615 {
616 SWR_ASSERT(stream < MAX_SO_STREAMS);
617
618 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
619 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
620
621 for (uint32_t b = 0; b < numOutputBytes; ++b)
622 {
623 uint8_t curInputByte = pStreamIdBase[2*b];
624 uint8_t outByte = 0;
625 for (uint32_t i = 0; i < 4; ++i)
626 {
627 if ((curInputByte & 0x3) != stream)
628 {
629 outByte |= (1 << i);
630 }
631 curInputByte >>= 2;
632 }
633
634 curInputByte = pStreamIdBase[2 * b + 1];
635 for (uint32_t i = 0; i < 4; ++i)
636 {
637 if ((curInputByte & 0x3) != stream)
638 {
639 outByte |= (1 << (i + 4));
640 }
641 curInputByte >>= 2;
642 }
643
644 *pCutBuffer++ = outByte;
645 }
646 }
647
648 THREAD SWR_GS_CONTEXT tlsGsContext;
649
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage: assembles input prim attributes, runs the
///        jitted GS per instance, then re-assembles the GS output through
///        a cut-aware PA, feeding streamout and/or the clipper/binner.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (or streamID) buffer written by the GS
/// @param pStreamCutBuffer - scratch cut buffer for multi-stream GS output
/// @param pSoPrimData - scratch buffer used by streamout
/// @param primID - SIMD of primitive IDs for the input prims
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;  // needed by UPDATE_STAT_FE below

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // adjacency verts are included so the GS sees the full input primitive
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides describing the layout of the GS output buffer: maxNumVerts
    // per lane, rounded up to whole SIMD batches, per input prim, per
    // GS instance. Must match the layout used in AllocateGsBuffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // single stream: 1 cut bit per vert; multi-stream: 2 streamID bits per
    // vert, rounded up to a 4-byte multiple
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // execute the GS once per declared instance, advancing the output and
    // cut buffer pointers each iteration
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                // GS emitted nothing for this prim; skip it
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: intentionally shadows the pCutBuffer parameter; this
                // local selects which cut buffer the PA reads below
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // only the stream routed to the rasterizer is clipped/binned
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalar vClearMask = _simd_cmplt_ps(vpiAttrib[0].x, _simd_castsi_ps(vNumViewports));
                                    vpiAttrib[0].x = _simd_and_ps(vClearMask, vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
870
//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
/// @param ppStreamCutBuffer - pointer to temporary per-stream cut buffer;
///        only allocated for multi-stream GS output, else set to nullptr.
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    const uint32_t vertexStride = sizeof(simdvertex);
    // maxNumVerts per lane, rounded up to whole SIMD batches, per instance
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // cut buffer: 1 bit per possible output vert; streamID buffer: 2 bits
    // per vert, rounded up to a 4-byte multiple
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
912
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;              // hull shader input/output context, reused per patch batch
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output, one patch per SIMD lane

    // scratch buffer handed to TSInitCtx; grown on demand (see TessellationStages)
    void* pTxCtx;
    size_t tsCtxSize;                      // current size of pTxCtx in bytes

    // DS output storage, grown on demand to hold the largest patch seen so far
    simdscalar* pDSOutput;
    size_t numDSOutputVectors;             // capacity of pDSOutput in simdvector units
};

// Per-thread tessellation scratch; lazily allocated by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
928
929 //////////////////////////////////////////////////////////////////////////
930 /// @brief Allocate tessellation data for this worker thread.
931 INLINE
932 static void AllocateTessellationData(SWR_CONTEXT* pContext)
933 {
934 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
935 if (gt_pTessellationThreadData == nullptr)
936 {
937 gt_pTessellationThreadData = (TessellationThreadLocalData*)
938 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
939 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
940 }
941 }
942
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// Runs HS over the assembled patches, tessellates each patch, runs DS
/// over the generated domain points, then feeds the resulting prims to
/// GS / streamout / clipping as enabled by the template flags.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut buffer allocation (only used when GS enabled)
/// @param pCutStreamBuffer - GS per-stream cut buffer allocation
/// @param pSoPrimData - scratch buffer for streamout primitive data
/// @param primID - SIMD of primitive IDs for the input patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator from the thread-local scratch buffer.
    // NOTE(review): the first call appears to return nullptr when the scratch
    // buffer is too small, with tsCtxSize updated to the required size; the
    // retry below allocates and tries again — confirm against TSInitCtx docs.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Pick the clipper entry point for the post-DS topology; only needed
    // when rasterization is enabled.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison the HS output so stale data is obvious in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (SIMD lane) individually.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        // patch fully culled by the tess factors — nothing to shade
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory, growing the thread-local buffer only
        // when this patch needs more than any previous one.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // poison DS output in debug builds
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all domain points, one SIMD batch at a time.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes past the last domain point in the final batch
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Assemble the tessellator-generated prims from the DS output and
        // push them down the remainder of the front end.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1137
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// Fetches vertices, runs the VS, and drives primitive assembly through
/// tessellation / GS / streamout / binning per the template flags.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the primitive-restart cut index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // lane offsets 0..7 used to synthesize sequential vertex IDs for
    // non-indexed draws
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last index we will
    // consume so partial-SIMD fetches at the end can be masked off.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // Per-draw scratch allocations for the optional GS stage.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0; // first vertex of the current SIMD batch

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // synthesize sequential indices starting at startVertexID
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            // Fetch + VS only while there are vertices left; the PA may
            // still have assembly work after the last fetch.
            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Dispatch the assembled prims to the first
                            // enabled downstream stage.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance to the next SIMD batch of vertices/indices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    // NOTE(review): numPrims only exists under KNOB_ENABLE_RDTSC; presumably
    // RDTSC_STOP compiles to nothing otherwise — confirm macro definition.
    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1404
// Adapter used with TemplateArgUnroller to select a ProcessDraw
// instantiation from runtime booleans (see GetProcessDrawFunc).
struct FEDrawChooser
{
    // function-pointer type produced by the unroller
    typedef PFN_FE_WORK_FUNC FuncType;

    // Returns the ProcessDraw instantiation for the given template args.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1415
1416
// Selector for correct templated Draw front-end function
// Maps the six runtime feature flags onto the matching ProcessDraw
// template instantiation (one instantiation per flag combination).
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1428
1429 //////////////////////////////////////////////////////////////////////////
1430 /// @brief Processes attributes for the backend based on linkage mask and
1431 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1432 /// @param pDC - Draw context
1433 /// @param pa - Primitive Assembly state
1434 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1435 /// @param pLinkageMap - maps VS attribute slot to PS slot
1436 /// @param triIndex - Triangle to process attributes for
1437 /// @param pBuffer - Output result
1438 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1439 INLINE void ProcessAttributes(
1440 DRAW_CONTEXT *pDC,
1441 PA_STATE&pa,
1442 uint32_t triIndex,
1443 uint32_t primId,
1444 float *pBuffer)
1445 {
1446 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1447 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1448 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1449 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1450 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1451 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1452
1453 static const float constTable[3][4] = {
1454 {0.0f, 0.0f, 0.0f, 0.0f},
1455 {0.0f, 0.0f, 0.0f, 1.0f},
1456 {1.0f, 1.0f, 1.0f, 1.0f}
1457 };
1458
1459 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1460 {
1461 uint32_t inputSlot;
1462 if (IsSwizzledT::value)
1463 {
1464 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1465 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1466
1467 }
1468 else
1469 {
1470 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1471 }
1472
1473 __m128 attrib[3]; // triangle attribs (always 4 wide)
1474 float* pAttribStart = pBuffer;
1475
1476 if (HasConstantInterpT::value || IsDegenerate::value)
1477 {
1478 if (_bittest(&constantInterpMask, i))
1479 {
1480 uint32_t vid;
1481 uint32_t adjustedTriIndex;
1482 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1483 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1484 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1485 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1486 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1487
1488 switch (topo) {
1489 case TOP_QUAD_LIST:
1490 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1491 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1492 break;
1493 case TOP_QUAD_STRIP:
1494 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1495 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1496 break;
1497 case TOP_TRIANGLE_STRIP:
1498 adjustedTriIndex = triIndex;
1499 vid = (triIndex & 1)
1500 ? tristripProvokingVertex[provokingVertex]
1501 : provokingVertex;
1502 break;
1503 default:
1504 adjustedTriIndex = triIndex;
1505 vid = provokingVertex;
1506 break;
1507 }
1508
1509 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1510
1511 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1512 {
1513 _mm_store_ps(pBuffer, attrib[vid]);
1514 pBuffer += 4;
1515 }
1516 }
1517 else
1518 {
1519 pa.AssembleSingle(inputSlot, triIndex, attrib);
1520
1521 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1522 {
1523 _mm_store_ps(pBuffer, attrib[i]);
1524 pBuffer += 4;
1525 }
1526 }
1527 }
1528 else
1529 {
1530 pa.AssembleSingle(inputSlot, triIndex, attrib);
1531
1532 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1533 {
1534 _mm_store_ps(pBuffer, attrib[i]);
1535 pBuffer += 4;
1536 }
1537 }
1538
1539 // pad out the attrib buffer to 3 verts to ensure the triangle
1540 // interpolation code in the pixel shader works correctly for the
1541 // 3 topologies - point, line, tri. This effectively zeros out the
1542 // effect of the missing vertices in the triangle interpolation.
1543 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1544 {
1545 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1546 pBuffer += 4;
1547 }
1548
1549 // check for constant source overrides
1550 if (IsSwizzledT::value)
1551 {
1552 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1553 if (mask)
1554 {
1555 DWORD comp;
1556 while (_BitScanForward(&comp, mask))
1557 {
1558 mask &= ~(1 << comp);
1559
1560 float constantValue = 0.0f;
1561 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1562 {
1563 case SWR_CONSTANT_SOURCE_CONST_0000:
1564 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1565 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1566 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1567 break;
1568 case SWR_CONSTANT_SOURCE_PRIM_ID:
1569 constantValue = *(float*)&primId;
1570 break;
1571 }
1572
1573 // apply constant value to all 3 vertices
1574 for (uint32_t v = 0; v < 3; ++v)
1575 {
1576 pAttribStart[comp + v * 4] = constantValue;
1577 }
1578 }
1579 }
1580 }
1581 }
1582 }
1583
1584
// Signature shared by all ProcessAttributes instantiations:
// (pDC, pa, triIndex, primId, pBuffer)
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

// Adapter used with TemplateArgUnroller to select a ProcessAttributes
// instantiation from runtime arguments (see GetProcessAttributesFunc).
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    // Returns the ProcessAttributes instantiation for the given template args.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1597
// Selects the ProcessAttributes instantiation matching the runtime arguments.
// NumVerts must be in [1, 3] (enforced by IntArg); IsDegenerate defaults to
// false for the normal (non-conservative) rasterization path.
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1602
1603 //////////////////////////////////////////////////////////////////////////
1604 /// @brief Processes enabled user clip distances. Loads the active clip
1605 /// distances from the PA, sets up barycentric equations, and
1606 /// stores the results to the output buffer
1607 /// @param pa - Primitive Assembly state
1608 /// @param primIndex - primitive index to process
1609 /// @param clipDistMask - mask of enabled clip distances
1610 /// @param pUserClipBuffer - buffer to store results
1611 template<uint32_t NumVerts>
1612 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1613 {
1614 DWORD clipDist;
1615 while (_BitScanForward(&clipDist, clipDistMask))
1616 {
1617 clipDistMask &= ~(1 << clipDist);
1618 uint32_t clipSlot = clipDist >> 2;
1619 uint32_t clipComp = clipDist & 0x3;
1620 uint32_t clipAttribSlot = clipSlot == 0 ?
1621 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1622
1623 __m128 primClipDist[3];
1624 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1625
1626 float vertClipDist[NumVerts];
1627 for (uint32_t e = 0; e < NumVerts; ++e)
1628 {
1629 OSALIGNSIMD(float) aVertClipDist[4];
1630 _mm_store_ps(aVertClipDist, primClipDist[e]);
1631 vertClipDist[e] = aVertClipDist[clipComp];
1632 };
1633
1634 // setup plane equations for barycentric interpolation in the backend
1635 float baryCoeff[NumVerts];
1636 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1637 {
1638 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1639 }
1640 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1641
1642 for (uint32_t e = 0; e < NumVerts; ++e)
1643 {
1644 *(pUserClipBuffer++) = baryCoeff[e];
1645 }
1646 }
1647 }
1648
1649 //////////////////////////////////////////////////////////////////////////
1650 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1651 /// Point precision from FP32.
1652 template <typename PT = FixedPointTraits<Fixed_16_8>>
1653 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1654 {
1655 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1656 return _simd_cvtps_epi32(vFixed);
1657 }
1658
1659 //////////////////////////////////////////////////////////////////////////
1660 /// @brief Helper function to set the X,Y coords of a triangle to the
1661 /// requested Fixed Point precision from FP32.
1662 /// @param tri: simdvector[3] of FP triangle verts
1663 /// @param vXi: fixed point X coords of tri verts
1664 /// @param vYi: fixed point Y coords of tri verts
1665 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1666 {
1667 vXi[0] = fpToFixedPointVertical(tri[0].x);
1668 vYi[0] = fpToFixedPointVertical(tri[0].y);
1669 vXi[1] = fpToFixedPointVertical(tri[1].x);
1670 vYi[1] = fpToFixedPointVertical(tri[1].y);
1671 vXi[2] = fpToFixedPointVertical(tri[2].x);
1672 vYi[2] = fpToFixedPointVertical(tri[2].y);
1673 }
1674
1675 //////////////////////////////////////////////////////////////////////////
1676 /// @brief Calculate bounding box for current triangle
1677 /// @tparam CT: ConservativeRastFETraits type
1678 /// @param vX: fixed point X position for triangle verts
1679 /// @param vY: fixed point Y position for triangle verts
1680 /// @param bbox: fixed point bbox
1681 /// *Note*: expects vX, vY to be in the correct precision for the type
1682 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1683 template <typename CT>
1684 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1685 {
1686 simdscalari vMinX = vX[0];
1687 vMinX = _simd_min_epi32(vMinX, vX[1]);
1688 vMinX = _simd_min_epi32(vMinX, vX[2]);
1689
1690 simdscalari vMaxX = vX[0];
1691 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1692 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1693
1694 simdscalari vMinY = vY[0];
1695 vMinY = _simd_min_epi32(vMinY, vY[1]);
1696 vMinY = _simd_min_epi32(vMinY, vY[2]);
1697
1698 simdscalari vMaxY = vY[0];
1699 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1700 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1701
1702 bbox.left = vMinX;
1703 bbox.right = vMaxX;
1704 bbox.top = vMinY;
1705 bbox.bottom = vMaxY;
1706 }
1707
1708 //////////////////////////////////////////////////////////////////////////
1709 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1710 /// Offsets BBox for conservative rast
1711 template <>
1712 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1713 {
1714 // FE conservative rast traits
1715 typedef FEConservativeRastT CT;
1716
1717 simdscalari vMinX = vX[0];
1718 vMinX = _simd_min_epi32(vMinX, vX[1]);
1719 vMinX = _simd_min_epi32(vMinX, vX[2]);
1720
1721 simdscalari vMaxX = vX[0];
1722 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1723 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1724
1725 simdscalari vMinY = vY[0];
1726 vMinY = _simd_min_epi32(vMinY, vY[1]);
1727 vMinY = _simd_min_epi32(vMinY, vY[2]);
1728
1729 simdscalari vMaxY = vY[0];
1730 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1731 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1732
1733 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1734 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1735 bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1736 bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1737 bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1738 bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1739 }
1740
1741 //////////////////////////////////////////////////////////////////////////
1742 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1743 /// culling, viewport transform, etc.
1744 /// @param pDC - pointer to draw context.
1745 /// @param pa - The primitive assembly object.
1746 /// @param workerId - thread's worker id. Even thread has a unique id.
1747 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1748 /// @param primID - Primitive ID for each triangle.
1749 /// @param viewportIdx - viewport array index for each triangle.
1750 /// @tparam CT - ConservativeRastFETraits
1751 template <typename CT>
1752 void BinTriangles(
1753 DRAW_CONTEXT *pDC,
1754 PA_STATE& pa,
1755 uint32_t workerId,
1756 simdvector tri[3],
1757 uint32_t triMask,
1758 simdscalari primID,
1759 simdscalari viewportIdx)
1760 {
1761 RDTSC_START(FEBinTriangles);
1762
1763 const API_STATE& state = GetApiState(pDC);
1764 const SWR_RASTSTATE& rastState = state.rastState;
1765 const SWR_FRONTEND_STATE& feState = state.frontendState;
1766 const SWR_GS_STATE& gsState = state.gsState;
1767 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1768
1769
1770 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1771 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1772 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1773
1774 if (!feState.vpTransformDisable)
1775 {
1776 // perspective divide
1777 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1778 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1779 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1780
1781 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1782 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1783 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1784
1785 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1786 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1787 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1788
1789 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1790 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1791 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1792
1793 // viewport transform to screen coords
1794 if (state.gsState.emitsViewportArrayIndex)
1795 {
1796 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
1797 }
1798 else
1799 {
1800 viewportTransform<3>(tri, state.vpMatrices);
1801 }
1802 }
1803
1804 // adjust for pixel center location
1805 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1806 tri[0].x = _simd_add_ps(tri[0].x, offset);
1807 tri[0].y = _simd_add_ps(tri[0].y, offset);
1808
1809 tri[1].x = _simd_add_ps(tri[1].x, offset);
1810 tri[1].y = _simd_add_ps(tri[1].y, offset);
1811
1812 tri[2].x = _simd_add_ps(tri[2].x, offset);
1813 tri[2].y = _simd_add_ps(tri[2].y, offset);
1814
1815 simdscalari vXi[3], vYi[3];
1816 // Set vXi, vYi to required fixed point precision
1817 FPToFixedPoint(tri, vXi, vYi);
1818
1819 // triangle setup
1820 simdscalari vAi[3], vBi[3];
1821 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1822
1823 // determinant
1824 simdscalari vDet[2];
1825 calcDeterminantIntVertical(vAi, vBi, vDet);
1826
1827 // cull zero area
1828 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1829 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1830
1831 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1832
1833 uint32_t origTriMask = triMask;
1834 // don't cull degenerate triangles if we're conservatively rasterizing
1835 if(!CT::IsConservativeT::value)
1836 {
1837 triMask &= ~cullZeroAreaMask;
1838 }
1839
1840 // determine front winding tris
1841 // CW +det
1842 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1843 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1844 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1845 int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1846
1847 uint32_t frontWindingTris;
1848 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1849 {
1850 frontWindingTris = cwTriMask;
1851 }
1852 else
1853 {
1854 frontWindingTris = ~cwTriMask;
1855 }
1856
1857 // cull
1858 uint32_t cullTris;
1859 switch ((SWR_CULLMODE)rastState.cullMode)
1860 {
1861 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1862 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1863 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1864 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1865 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1866 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1867 }
1868
1869 triMask &= ~cullTris;
1870
1871 if (origTriMask ^ triMask)
1872 {
1873 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1874 }
1875
1876 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1877 // compute per tri backface
1878 uint32_t frontFaceMask = frontWindingTris;
1879 uint32_t *pPrimID = (uint32_t *)&primID;
1880 DWORD triIndex = 0;
1881 // for center sample pattern, all samples are at pixel center; calculate coverage
1882 // once at center and broadcast the results in the backend
1883 const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1884 uint32_t edgeEnable;
1885 PFN_WORK_FUNC pfnWork;
1886 if(CT::IsConservativeT::value)
1887 {
1888 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1889 // used to call the appropriate templated rasterizer function
1890 if(cullZeroAreaMask > 0)
1891 {
1892 // e0 = v1-v0
1893 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
1894 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
1895 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
1896
1897 // e1 = v2-v1
1898 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
1899 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
1900 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
1901
1902 // e2 = v0-v2
1903 // if v0 == v1 & v1 == v2, v0 == v2
1904 uint32_t e2Mask = e0Mask & e1Mask;
1905 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1906
1907 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1908 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1909 e0Mask = pdep_u32(e0Mask, 0x00249249);
1910 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1911 e1Mask = pdep_u32(e1Mask, 0x00492492);
1912 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1913 e2Mask = pdep_u32(e2Mask, 0x00924924);
1914
1915 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1916 }
1917 else
1918 {
1919 edgeEnable = 0x00FFFFFF;
1920 }
1921 }
1922 else
1923 {
1924 // degenerate triangles won't be sent to rasterizer; just enable all edges
1925 pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1926 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
1927 (rastState.scissorEnable > 0));
1928 }
1929
1930 if (!triMask)
1931 {
1932 goto endBinTriangles;
1933 }
1934
1935 // Calc bounding box of triangles
1936 simdBBox bbox;
1937 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1938
1939 // determine if triangle falls between pixel centers and discard
1940 // only discard for non-MSAA case and when conservative rast is disabled
1941 // (left + 127) & ~255
1942 // (right + 128) & ~255
1943 if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1944 {
1945 origTriMask = triMask;
1946
1947 int cullCenterMask;
1948 {
1949 simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1950 left = _simd_and_si(left, _simd_set1_epi32(~255));
1951 simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1952 right = _simd_and_si(right, _simd_set1_epi32(~255));
1953
1954 simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1955
1956 simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1957 top = _simd_and_si(top, _simd_set1_epi32(~255));
1958 simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1959 bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1960
1961 simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1962 vMaskV = _simd_or_si(vMaskH, vMaskV);
1963 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1964 }
1965
1966 triMask &= ~cullCenterMask;
1967
1968 if(origTriMask ^ triMask)
1969 {
1970 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1971 }
1972 }
1973
1974 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1975 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1976 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1977 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1978 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1979
1980 if(CT::IsConservativeT::value)
1981 {
1982 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1983 // some area. Bump the right/bottom edges out
1984 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
1985 bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
1986 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
1987 bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
1988 }
1989
1990 // Cull tris completely outside scissor
1991 {
1992 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1993 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1994 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1995 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1996 triMask = triMask & ~maskOutsideScissor;
1997 }
1998
1999 if (!triMask)
2000 {
2001 goto endBinTriangles;
2002 }
2003
2004 // Convert triangle bbox to macrotile units.
2005 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2006 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2007 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2008 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2009
2010 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2011 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2012 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2013 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2014 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2015
2016 // transpose verts needed for backend
2017 /// @todo modify BE to take non-transformed verts
2018 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2019 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
2020 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
2021 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
2022 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
2023
2024 // store render target array index
2025 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2026 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2027 {
2028 simdvector vRtai[3];
2029 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2030 simdscalari vRtaii;
2031 vRtaii = _simd_castps_si(vRtai[0].x);
2032 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2033 }
2034 else
2035 {
2036 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2037 }
2038
2039 // scan remaining valid triangles and bin each separately
2040 while (_BitScanForward(&triIndex, triMask))
2041 {
2042 uint32_t linkageCount = state.backendState.numAttributes;
2043 uint32_t numScalarAttribs = linkageCount * 4;
2044
2045 BE_WORK work;
2046 work.type = DRAW;
2047
2048 bool isDegenerate;
2049 if(CT::IsConservativeT::value)
2050 {
2051 // only rasterize valid edges if we have a degenerate primitive
2052 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
2053 work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
2054 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
2055 (rastState.scissorEnable > 0));
2056
2057 // Degenerate triangles are required to be constant interpolated
2058 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
2059 }
2060 else
2061 {
2062 isDegenerate = false;
2063 work.pfnWork = pfnWork;
2064 }
2065
2066 // Select attribute processor
2067 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
2068 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
2069
2070 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2071
2072 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
2073 desc.triFlags.primID = pPrimID[triIndex];
2074 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
2075
2076 auto pArena = pDC->pArena;
2077 SWR_ASSERT(pArena != nullptr);
2078
2079 // store active attribs
2080 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2081 desc.pAttribs = pAttribs;
2082 desc.numAttribs = linkageCount;
2083 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
2084
2085 // store triangle vertex data
2086 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2087
2088 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
2089 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
2090 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
2091 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
2092
2093 // store user clip distances
2094 if (rastState.clipDistanceMask)
2095 {
2096 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2097 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2098 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2099 }
2100
2101 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
2102 {
2103 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
2104 {
2105 #if KNOB_ENABLE_TOSS_POINTS
2106 if (!KNOB_TOSS_SETUP_TRIS)
2107 #endif
2108 {
2109 pTileMgr->enqueue(x, y, &work);
2110 }
2111 }
2112 }
2113 triMask &= ~(1 << triIndex);
2114 }
2115
2116 endBinTriangles:
2117 RDTSC_STOP(FEBinTriangles, 1, 0);
2118 }
2119
2120 struct FEBinTrianglesChooser
2121 {
2122 typedef PFN_PROCESS_PRIMS FuncType;
2123
2124 template <typename... ArgsB>
2125 static FuncType GetFunc()
2126 {
2127 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2128 }
2129 };
2130
2131 // Selector for correct templated BinTrinagles function
2132 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2133 {
2134 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2135 }
2136
2137 //////////////////////////////////////////////////////////////////////////
2138 /// @brief Bin SIMD points to the backend. Only supports point size of 1
2139 /// @param pDC - pointer to draw context.
2140 /// @param pa - The primitive assembly object.
2141 /// @param workerId - thread's worker id. Even thread has a unique id.
2142 /// @param tri - Contains point position data for SIMDs worth of points.
2143 /// @param primID - Primitive ID for each point.
2144 void BinPoints(
2145 DRAW_CONTEXT *pDC,
2146 PA_STATE& pa,
2147 uint32_t workerId,
2148 simdvector prim[3],
2149 uint32_t primMask,
2150 simdscalari primID,
2151 simdscalari viewportIdx)
2152 {
2153 RDTSC_START(FEBinPoints);
2154
2155 simdvector& primVerts = prim[0];
2156
2157 const API_STATE& state = GetApiState(pDC);
2158 const SWR_FRONTEND_STATE& feState = state.frontendState;
2159 const SWR_GS_STATE& gsState = state.gsState;
2160 const SWR_RASTSTATE& rastState = state.rastState;
2161
2162 // Select attribute processor
2163 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
2164 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2165
2166 if (!feState.vpTransformDisable)
2167 {
2168 // perspective divide
2169 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
2170 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
2171 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
2172 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
2173
2174 // viewport transform to screen coords
2175 if (state.gsState.emitsViewportArrayIndex)
2176 {
2177 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
2178 }
2179 else
2180 {
2181 viewportTransform<1>(&primVerts, state.vpMatrices);
2182 }
2183 }
2184
2185 // adjust for pixel center location
2186 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2187 primVerts.x = _simd_add_ps(primVerts.x, offset);
2188 primVerts.y = _simd_add_ps(primVerts.y, offset);
2189
2190 // convert to fixed point
2191 simdscalari vXi, vYi;
2192 vXi = fpToFixedPointVertical(primVerts.x);
2193 vYi = fpToFixedPointVertical(primVerts.y);
2194
2195 if (CanUseSimplePoints(pDC))
2196 {
2197 // adjust for top-left rule
2198 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
2199 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
2200
2201 // cull points off the top-left edge of the viewport
2202 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
2203 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
2204
2205 // compute macro tile coordinates
2206 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2207 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2208
2209 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
2210 _simd_store_si((simdscalari*)aMacroX, macroX);
2211 _simd_store_si((simdscalari*)aMacroY, macroY);
2212
2213 // compute raster tile coordinates
2214 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
2215 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
2216
2217 // compute raster tile relative x,y for coverage mask
2218 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
2219 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
2220
2221 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
2222 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
2223
2224 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
2225 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
2226 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
2227 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
2228
2229 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
2230 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
2231 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
2232 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
2233
2234 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
2235 _simd_store_ps((float*)aZ, primVerts.z);
2236
2237 // store render target array index
2238 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2239 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2240 {
2241 simdvector vRtai;
2242 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
2243 simdscalari vRtaii = _simd_castps_si(vRtai.x);
2244 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2245 }
2246 else
2247 {
2248 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2249 }
2250
2251 uint32_t *pPrimID = (uint32_t *)&primID;
2252 DWORD primIndex = 0;
2253
2254 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
2255
2256 // scan remaining valid triangles and bin each separately
2257 while (_BitScanForward(&primIndex, primMask))
2258 {
2259 uint32_t linkageCount = backendState.numAttributes;
2260 uint32_t numScalarAttribs = linkageCount * 4;
2261
2262 BE_WORK work;
2263 work.type = DRAW;
2264
2265 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2266
2267 // points are always front facing
2268 desc.triFlags.frontFacing = 1;
2269 desc.triFlags.primID = pPrimID[primIndex];
2270 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2271
2272 work.pfnWork = RasterizeSimplePoint;
2273
2274 auto pArena = pDC->pArena;
2275 SWR_ASSERT(pArena != nullptr);
2276
2277 // store attributes
2278 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
2279 desc.pAttribs = pAttribs;
2280 desc.numAttribs = linkageCount;
2281
2282 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
2283
2284 // store raster tile aligned x, y, perspective correct z
2285 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2286 desc.pTriBuffer = pTriBuffer;
2287 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
2288 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
2289 *pTriBuffer = aZ[primIndex];
2290
2291 uint32_t tX = aTileRelativeX[primIndex];
2292 uint32_t tY = aTileRelativeY[primIndex];
2293
2294 // pack the relative x,y into the coverageMask, the rasterizer will
2295 // generate the true coverage mask from it
2296 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
2297
2298 // bin it
2299 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2300 #if KNOB_ENABLE_TOSS_POINTS
2301 if (!KNOB_TOSS_SETUP_TRIS)
2302 #endif
2303 {
2304 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
2305 }
2306 primMask &= ~(1 << primIndex);
2307 }
2308 }
2309 else
2310 {
2311 // non simple points need to be potentially binned to multiple macro tiles
2312 simdscalar vPointSize;
2313 if (rastState.pointParam)
2314 {
2315 simdvector size[3];
2316 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
2317 vPointSize = size[0].x;
2318 }
2319 else
2320 {
2321 vPointSize = _simd_set1_ps(rastState.pointSize);
2322 }
2323
2324 // bloat point to bbox
2325 simdBBox bbox;
2326 bbox.left = bbox.right = vXi;
2327 bbox.top = bbox.bottom = vYi;
2328
2329 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
2330 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2331 bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2332 bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2333 bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2334 bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2335
2336 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2337 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2338 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2339 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2340 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2341
2342 // Cull bloated points completely outside scissor
2343 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2344 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2345 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2346 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2347 primMask = primMask & ~maskOutsideScissor;
2348
2349 // Convert bbox to macrotile units.
2350 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2351 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2352 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2353 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2354
2355 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2356 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2357 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2358 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2359 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2360
2361 // store render target array index
2362 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2363 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2364 {
2365 simdvector vRtai[2];
2366 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2367 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2368 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2369 }
2370 else
2371 {
2372 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2373 }
2374
2375 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
2376 _simd_store_ps((float*)aPointSize, vPointSize);
2377
2378 uint32_t *pPrimID = (uint32_t *)&primID;
2379
2380 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
2381 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
2382 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
2383
2384 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
2385 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
2386 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
2387
2388 // scan remaining valid prims and bin each separately
2389 const SWR_BACKEND_STATE& backendState = state.backendState;
2390 DWORD primIndex;
2391 while (_BitScanForward(&primIndex, primMask))
2392 {
2393 uint32_t linkageCount = backendState.numAttributes;
2394 uint32_t numScalarAttribs = linkageCount * 4;
2395
2396 BE_WORK work;
2397 work.type = DRAW;
2398
2399 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2400
2401 desc.triFlags.frontFacing = 1;
2402 desc.triFlags.primID = pPrimID[primIndex];
2403 desc.triFlags.pointSize = aPointSize[primIndex];
2404 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2405
2406 work.pfnWork = RasterizeTriPoint;
2407
2408 auto pArena = pDC->pArena;
2409 SWR_ASSERT(pArena != nullptr);
2410
2411 // store active attribs
2412 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2413 desc.numAttribs = linkageCount;
2414 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2415
2416 // store point vertex data
2417 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2418 desc.pTriBuffer = pTriBuffer;
2419 *pTriBuffer++ = aPrimVertsX[primIndex];
2420 *pTriBuffer++ = aPrimVertsY[primIndex];
2421 *pTriBuffer = aPrimVertsZ[primIndex];
2422
2423 // store user clip distances
2424 if (rastState.clipDistanceMask)
2425 {
2426 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2427 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2428 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2429 }
2430
2431 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2432 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2433 {
2434 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2435 {
2436 #if KNOB_ENABLE_TOSS_POINTS
2437 if (!KNOB_TOSS_SETUP_TRIS)
2438 #endif
2439 {
2440 pTileMgr->enqueue(x, y, &work);
2441 }
2442 }
2443 }
2444
2445 primMask &= ~(1 << primIndex);
2446 }
2447 }
2448
2449
2450
2451
2452 RDTSC_STOP(FEBinPoints, 1, 0);
2453 }
2454
2455 //////////////////////////////////////////////////////////////////////////
2456 /// @brief Bin SIMD lines to the backend.
2457 /// @param pDC - pointer to draw context.
2458 /// @param pa - The primitive assembly object.
2459 /// @param workerId - thread's worker id. Even thread has a unique id.
2460 /// @param tri - Contains line position data for SIMDs worth of points.
2461 /// @param primID - Primitive ID for each line.
2462 void BinLines(
2463 DRAW_CONTEXT *pDC,
2464 PA_STATE& pa,
2465 uint32_t workerId,
2466 simdvector prim[],
2467 uint32_t primMask,
2468 simdscalari primID,
2469 simdscalari viewportIdx)
2470 {
2471 RDTSC_START(FEBinLines);
2472
2473 const API_STATE& state = GetApiState(pDC);
2474 const SWR_RASTSTATE& rastState = state.rastState;
2475 const SWR_FRONTEND_STATE& feState = state.frontendState;
2476 const SWR_GS_STATE& gsState = state.gsState;
2477
2478 // Select attribute processor
2479 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2480 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2481
2482 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2483 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2484
2485 if (!feState.vpTransformDisable)
2486 {
2487 // perspective divide
2488 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2489 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2490
2491 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2492 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2493
2494 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2495 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2496
2497 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2498 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2499
2500 // viewport transform to screen coords
2501 if (state.gsState.emitsViewportArrayIndex)
2502 {
2503 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2504 }
2505 else
2506 {
2507 viewportTransform<2>(prim, state.vpMatrices);
2508 }
2509 }
2510
2511 // adjust for pixel center location
2512 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2513 prim[0].x = _simd_add_ps(prim[0].x, offset);
2514 prim[0].y = _simd_add_ps(prim[0].y, offset);
2515
2516 prim[1].x = _simd_add_ps(prim[1].x, offset);
2517 prim[1].y = _simd_add_ps(prim[1].y, offset);
2518
2519 // convert to fixed point
2520 simdscalari vXi[2], vYi[2];
2521 vXi[0] = fpToFixedPointVertical(prim[0].x);
2522 vYi[0] = fpToFixedPointVertical(prim[0].y);
2523 vXi[1] = fpToFixedPointVertical(prim[1].x);
2524 vYi[1] = fpToFixedPointVertical(prim[1].y);
2525
2526 // compute x-major vs y-major mask
2527 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2528 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2529 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2530 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2531
2532 // cull zero-length lines
2533 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2534 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2535
2536 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2537
2538 uint32_t *pPrimID = (uint32_t *)&primID;
2539
2540 simdscalar vUnused = _simd_setzero_ps();
2541
2542 // Calc bounding box of lines
2543 simdBBox bbox;
2544 bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
2545 bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
2546 bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
2547 bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
2548
2549 // bloat bbox by line width along minor axis
2550 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2551 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2552 simdBBox bloatBox;
2553 bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2554 bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2555 bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2556 bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2557
2558 bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
2559 bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
2560 bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
2561 bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
2562
2563 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2564 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2565 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2566 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2567 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2568
2569 // Cull prims completely outside scissor
2570 {
2571 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2572 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2573 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2574 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2575 primMask = primMask & ~maskOutsideScissor;
2576 }
2577
2578 if (!primMask)
2579 {
2580 goto endBinLines;
2581 }
2582
2583 // Convert triangle bbox to macrotile units.
2584 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2585 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2586 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2587 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2588
2589 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2590 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2591 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2592 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2593 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2594
2595 // transpose verts needed for backend
2596 /// @todo modify BE to take non-transformed verts
2597 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2598 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2599 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2600 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2601 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2602
2603 // store render target array index
2604 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2605 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2606 {
2607 simdvector vRtai[2];
2608 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2609 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2610 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2611 }
2612 else
2613 {
2614 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2615 }
2616
2617 // scan remaining valid prims and bin each separately
2618 DWORD primIndex;
2619 while (_BitScanForward(&primIndex, primMask))
2620 {
2621 uint32_t linkageCount = state.backendState.numAttributes;
2622 uint32_t numScalarAttribs = linkageCount * 4;
2623
2624 BE_WORK work;
2625 work.type = DRAW;
2626
2627 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2628
2629 desc.triFlags.frontFacing = 1;
2630 desc.triFlags.primID = pPrimID[primIndex];
2631 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2632 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2633
2634 work.pfnWork = RasterizeLine;
2635
2636 auto pArena = pDC->pArena;
2637 SWR_ASSERT(pArena != nullptr);
2638
2639 // store active attribs
2640 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2641 desc.numAttribs = linkageCount;
2642 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2643
2644 // store line vertex data
2645 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2646 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2647 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2648 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2649 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2650
2651 // store user clip distances
2652 if (rastState.clipDistanceMask)
2653 {
2654 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2655 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2656 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2657 }
2658
2659 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2660 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2661 {
2662 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2663 {
2664 #if KNOB_ENABLE_TOSS_POINTS
2665 if (!KNOB_TOSS_SETUP_TRIS)
2666 #endif
2667 {
2668 pTileMgr->enqueue(x, y, &work);
2669 }
2670 }
2671 }
2672
2673 primMask &= ~(1 << primIndex);
2674 }
2675
2676 endBinLines:
2677
2678 RDTSC_STOP(FEBinLines, 1, 0);
2679 }