swr: [rasterizer core] improve implementation for SoWriteOffset
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief Offsets added to post-viewport vertex positions based on
55 /// raster state.
// Lookup table indexed by SWR_PIXEL_LOCATION: offset 0.0 for pixel-center
// positions, 0.5 when positions are specified at the pixel's upper-left corner.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrGetStats.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to stats callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessQueryStats(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
97 BE_WORK work;
98 work.type = QUERYSTATS;
99 work.pfnWork = ProcessQueryStatsBE;
100 work.desc.queryStats = *pQueryStats;
101
102 MacroTileMgr *pTileMgr = pDC->pTileMgr;
103 pTileMgr->enqueue(0, 0, &work);
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief FE handler for SwrClearRenderTarget.
108 /// @param pContext - pointer to SWR context.
109 /// @param pDC - pointer to draw context.
110 /// @param workerId - thread's worker id. Even thread has a unique id.
111 /// @param pUserData - Pointer to user data passed back to clear callback.
112 /// @todo This should go away when we switch this to use compute threading.
113 void ProcessClear(
114 SWR_CONTEXT *pContext,
115 DRAW_CONTEXT *pDC,
116 uint32_t workerId,
117 void *pUserData)
118 {
119 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
120 MacroTileMgr *pTileMgr = pDC->pTileMgr;
121
122 const API_STATE& state = GetApiState(pDC);
123
124 // queue a clear to each macro tile
125 // compute macro tile bounds for the current scissor/viewport
126 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
127 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
128 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
129 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
130
131 BE_WORK work;
132 work.type = CLEAR;
133 work.pfnWork = ProcessClearBE;
134 work.desc.clear = *pClear;
135
136 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
137 {
138 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
139 {
140 pTileMgr->enqueue(x, y, &work);
141 }
142 }
143 }
144
145 //////////////////////////////////////////////////////////////////////////
146 /// @brief FE handler for SwrStoreTiles.
147 /// @param pContext - pointer to SWR context.
148 /// @param pDC - pointer to draw context.
149 /// @param workerId - thread's worker id. Even thread has a unique id.
150 /// @param pUserData - Pointer to user data passed back to callback.
151 /// @todo This should go away when we switch this to use compute threading.
152 void ProcessStoreTiles(
153 SWR_CONTEXT *pContext,
154 DRAW_CONTEXT *pDC,
155 uint32_t workerId,
156 void *pUserData)
157 {
158 RDTSC_START(FEProcessStoreTiles);
159 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
160 MacroTileMgr *pTileMgr = pDC->pTileMgr;
161
162 const API_STATE& state = GetApiState(pDC);
163
164 // queue a store to each macro tile
165 // compute macro tile bounds for the current render target
166 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
167 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
168
169 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
170 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
171
172 // store tiles
173 BE_WORK work;
174 work.type = STORETILES;
175 work.pfnWork = ProcessStoreTileBE;
176 work.desc.storeTiles = *pStore;
177
178 for (uint32_t x = 0; x < numMacroTilesX; ++x)
179 {
180 for (uint32_t y = 0; y < numMacroTilesY; ++y)
181 {
182 pTileMgr->enqueue(x, y, &work);
183 }
184 }
185
186 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
187 }
188
189 //////////////////////////////////////////////////////////////////////////
190 /// @brief FE handler for SwrInvalidateTiles.
191 /// @param pContext - pointer to SWR context.
192 /// @param pDC - pointer to draw context.
193 /// @param workerId - thread's worker id. Even thread has a unique id.
194 /// @param pUserData - Pointer to user data passed back to callback.
195 /// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect means "unspecified"; any nonzero edge selects the
    // caller-provided rect, otherwise fall back to viewport 0.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles:
    // start is rounded up and end rounded down so only fully-covered
    // macrotiles fall inside [start, end).
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round start down and end up instead.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp to the hot-tile array bounds so release builds stay in range too.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    // Note: end bounds are exclusive here (x < end), unlike ProcessClear's
    // inclusive macrotile loop.
    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
268
269 //////////////////////////////////////////////////////////////////////////
270 /// @brief Computes the number of primitives given the number of verts.
271 /// @param mode - primitive topology for draw operation.
272 /// @param numPrims - number of vertices or indices for draw.
273 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
274 uint32_t GetNumPrims(
275 PRIMITIVE_TOPOLOGY mode,
276 uint32_t numPrims)
277 {
278 switch (mode)
279 {
280 case TOP_POINT_LIST: return numPrims;
281 case TOP_TRIANGLE_LIST: return numPrims / 3;
282 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
283 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
284 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
285 case TOP_QUAD_LIST: return numPrims / 4;
286 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
287 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
288 case TOP_LINE_LIST: return numPrims / 2;
289 case TOP_LINE_LOOP: return numPrims;
290 case TOP_RECT_LIST: return numPrims / 3;
291 case TOP_LINE_LIST_ADJ: return numPrims / 4;
292 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
293 case TOP_TRI_LIST_ADJ: return numPrims / 6;
294 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
295
296 case TOP_PATCHLIST_1:
297 case TOP_PATCHLIST_2:
298 case TOP_PATCHLIST_3:
299 case TOP_PATCHLIST_4:
300 case TOP_PATCHLIST_5:
301 case TOP_PATCHLIST_6:
302 case TOP_PATCHLIST_7:
303 case TOP_PATCHLIST_8:
304 case TOP_PATCHLIST_9:
305 case TOP_PATCHLIST_10:
306 case TOP_PATCHLIST_11:
307 case TOP_PATCHLIST_12:
308 case TOP_PATCHLIST_13:
309 case TOP_PATCHLIST_14:
310 case TOP_PATCHLIST_15:
311 case TOP_PATCHLIST_16:
312 case TOP_PATCHLIST_17:
313 case TOP_PATCHLIST_18:
314 case TOP_PATCHLIST_19:
315 case TOP_PATCHLIST_20:
316 case TOP_PATCHLIST_21:
317 case TOP_PATCHLIST_22:
318 case TOP_PATCHLIST_23:
319 case TOP_PATCHLIST_24:
320 case TOP_PATCHLIST_25:
321 case TOP_PATCHLIST_26:
322 case TOP_PATCHLIST_27:
323 case TOP_PATCHLIST_28:
324 case TOP_PATCHLIST_29:
325 case TOP_PATCHLIST_30:
326 case TOP_PATCHLIST_31:
327 case TOP_PATCHLIST_32:
328 return numPrims / (mode - TOP_PATCHLIST_BASE);
329
330 case TOP_POLYGON:
331 case TOP_POINT_LIST_BF:
332 case TOP_LINE_STRIP_CONT:
333 case TOP_LINE_STRIP_BF:
334 case TOP_LINE_STRIP_CONT_BF:
335 case TOP_TRIANGLE_FAN_NOSTIPPLE:
336 case TOP_TRI_STRIP_REVERSE:
337 case TOP_PATCHLIST_BASE:
338 case TOP_UNKNOWN:
339 SWR_ASSERT(false, "Unsupported topology: %d", mode);
340 return 0;
341 }
342
343 return 0;
344 }
345
346 //////////////////////////////////////////////////////////////////////////
347 /// @brief Computes the number of verts given the number of primitives.
348 /// @param mode - primitive topology for draw operation.
349 /// @param numPrims - number of primitives for draw.
350 uint32_t GetNumVerts(
351 PRIMITIVE_TOPOLOGY mode,
352 uint32_t numPrims)
353 {
354 switch (mode)
355 {
356 case TOP_POINT_LIST: return numPrims;
357 case TOP_TRIANGLE_LIST: return numPrims * 3;
358 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
359 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
360 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
361 case TOP_QUAD_LIST: return numPrims * 4;
362 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
363 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
364 case TOP_LINE_LIST: return numPrims * 2;
365 case TOP_LINE_LOOP: return numPrims;
366 case TOP_RECT_LIST: return numPrims * 3;
367 case TOP_LINE_LIST_ADJ: return numPrims * 4;
368 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
369 case TOP_TRI_LIST_ADJ: return numPrims * 6;
370 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
371
372 case TOP_PATCHLIST_1:
373 case TOP_PATCHLIST_2:
374 case TOP_PATCHLIST_3:
375 case TOP_PATCHLIST_4:
376 case TOP_PATCHLIST_5:
377 case TOP_PATCHLIST_6:
378 case TOP_PATCHLIST_7:
379 case TOP_PATCHLIST_8:
380 case TOP_PATCHLIST_9:
381 case TOP_PATCHLIST_10:
382 case TOP_PATCHLIST_11:
383 case TOP_PATCHLIST_12:
384 case TOP_PATCHLIST_13:
385 case TOP_PATCHLIST_14:
386 case TOP_PATCHLIST_15:
387 case TOP_PATCHLIST_16:
388 case TOP_PATCHLIST_17:
389 case TOP_PATCHLIST_18:
390 case TOP_PATCHLIST_19:
391 case TOP_PATCHLIST_20:
392 case TOP_PATCHLIST_21:
393 case TOP_PATCHLIST_22:
394 case TOP_PATCHLIST_23:
395 case TOP_PATCHLIST_24:
396 case TOP_PATCHLIST_25:
397 case TOP_PATCHLIST_26:
398 case TOP_PATCHLIST_27:
399 case TOP_PATCHLIST_28:
400 case TOP_PATCHLIST_29:
401 case TOP_PATCHLIST_30:
402 case TOP_PATCHLIST_31:
403 case TOP_PATCHLIST_32:
404 return numPrims * (mode - TOP_PATCHLIST_BASE);
405
406 case TOP_POLYGON:
407 case TOP_POINT_LIST_BF:
408 case TOP_LINE_STRIP_CONT:
409 case TOP_LINE_STRIP_BF:
410 case TOP_LINE_STRIP_CONT_BF:
411 case TOP_TRIANGLE_FAN_NOSTIPPLE:
412 case TOP_TRI_STRIP_REVERSE:
413 case TOP_PATCHLIST_BASE:
414 case TOP_UNKNOWN:
415 SWR_ASSERT(false, "Unsupported topology: %d", mode);
416 return 0;
417 }
418
419 return 0;
420 }
421
422 //////////////////////////////////////////////////////////////////////////
423 /// @brief Return number of verts per primitive.
424 /// @param topology - topology
425 /// @param includeAdjVerts - include adjacent verts in primitive vertices
426 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
427 {
428 uint32_t numVerts = 0;
429 switch (topology)
430 {
431 case TOP_POINT_LIST:
432 case TOP_POINT_LIST_BF:
433 numVerts = 1;
434 break;
435 case TOP_LINE_LIST:
436 case TOP_LINE_STRIP:
437 case TOP_LINE_LIST_ADJ:
438 case TOP_LINE_LOOP:
439 case TOP_LINE_STRIP_CONT:
440 case TOP_LINE_STRIP_BF:
441 case TOP_LISTSTRIP_ADJ:
442 numVerts = 2;
443 break;
444 case TOP_TRIANGLE_LIST:
445 case TOP_TRIANGLE_STRIP:
446 case TOP_TRIANGLE_FAN:
447 case TOP_TRI_LIST_ADJ:
448 case TOP_TRI_STRIP_ADJ:
449 case TOP_TRI_STRIP_REVERSE:
450 case TOP_RECT_LIST:
451 numVerts = 3;
452 break;
453 case TOP_QUAD_LIST:
454 case TOP_QUAD_STRIP:
455 numVerts = 4;
456 break;
457 case TOP_PATCHLIST_1:
458 case TOP_PATCHLIST_2:
459 case TOP_PATCHLIST_3:
460 case TOP_PATCHLIST_4:
461 case TOP_PATCHLIST_5:
462 case TOP_PATCHLIST_6:
463 case TOP_PATCHLIST_7:
464 case TOP_PATCHLIST_8:
465 case TOP_PATCHLIST_9:
466 case TOP_PATCHLIST_10:
467 case TOP_PATCHLIST_11:
468 case TOP_PATCHLIST_12:
469 case TOP_PATCHLIST_13:
470 case TOP_PATCHLIST_14:
471 case TOP_PATCHLIST_15:
472 case TOP_PATCHLIST_16:
473 case TOP_PATCHLIST_17:
474 case TOP_PATCHLIST_18:
475 case TOP_PATCHLIST_19:
476 case TOP_PATCHLIST_20:
477 case TOP_PATCHLIST_21:
478 case TOP_PATCHLIST_22:
479 case TOP_PATCHLIST_23:
480 case TOP_PATCHLIST_24:
481 case TOP_PATCHLIST_25:
482 case TOP_PATCHLIST_26:
483 case TOP_PATCHLIST_27:
484 case TOP_PATCHLIST_28:
485 case TOP_PATCHLIST_29:
486 case TOP_PATCHLIST_30:
487 case TOP_PATCHLIST_31:
488 case TOP_PATCHLIST_32:
489 numVerts = topology - TOP_PATCHLIST_BASE;
490 break;
491 default:
492 SWR_ASSERT(false, "Unsupported topology: %d", topology);
493 break;
494 }
495
496 if (includeAdjVerts)
497 {
498 switch (topology)
499 {
500 case TOP_LISTSTRIP_ADJ:
501 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
502 case TOP_TRI_STRIP_ADJ:
503 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
504 default: break;
505 }
506 }
507
508 return numVerts;
509 }
510
511 //////////////////////////////////////////////////////////////////////////
512 /// @brief Generate mask from remaining work.
513 /// @param numWorkItems - Number of items being worked on by a SIMD.
514 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
515 {
516 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
517 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
518 return _simd_castps_si(vMask(mask));
519 }
520
521 //////////////////////////////////////////////////////////////////////////
522 /// @brief StreamOut - Streams vertex data out to SO buffers.
523 /// Generally, we are only streaming out a SIMDs worth of triangles.
524 /// @param pDC - pointer to draw context.
525 /// @param workerId - thread's worker id. Even thread has a unique id.
526 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Verts per assembled primitive, excluding adjacency verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    // For each assembled primitive, gather the enabled attributes into
    // pPrimData and invoke the jitted streamout shader (SOS) once per prim.
    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // Per-stream bitmask of attribute slots to stream out.
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the slot we just handled and move to the next set bit.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            // streamOffset is in dwords; convert to bytes for the driver.
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            // Record the updated offset (and mark it dirty) in the draw's
            // dynamic state so later consumers pick up the new position.
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
611
612 //////////////////////////////////////////////////////////////////////////
613 /// @brief Computes number of invocations. The current index represents
614 /// the start of the SIMD. The max index represents how much work
615 /// items are remaining. If there is less then a SIMD's left of work
616 /// then return the remaining amount of work.
617 /// @param curIndex - The start index for the SIMD.
618 /// @param maxIndex - The last index for all work items.
619 static INLINE uint32_t GetNumInvocations(
620 uint32_t curIndex,
621 uint32_t maxIndex)
622 {
623 uint32_t remainder = (maxIndex - curIndex);
624 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
625 }
626
627 //////////////////////////////////////////////////////////////////////////
628 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
629 /// The geometry shader will loop over each active streamout buffer, assembling
630 /// primitives for the downstream stages. When multistream output is enabled,
631 /// the generated stream ID buffer from the GS needs to be converted to a cut
632 /// buffer for the primitive assembler.
633 /// @param stream - stream id to generate the cut buffer for
634 /// @param pStreamIdBase - pointer to the stream ID buffer
635 /// @param numEmittedVerts - Number of total verts emitted by the GS
636 /// @param pCutBuffer - output buffer to write cuts to
637 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
638 {
639 SWR_ASSERT(stream < MAX_SO_STREAMS);
640
641 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
642 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
643
644 for (uint32_t b = 0; b < numOutputBytes; ++b)
645 {
646 uint8_t curInputByte = pStreamIdBase[2*b];
647 uint8_t outByte = 0;
648 for (uint32_t i = 0; i < 4; ++i)
649 {
650 if ((curInputByte & 0x3) != stream)
651 {
652 outByte |= (1 << i);
653 }
654 curInputByte >>= 2;
655 }
656
657 curInputByte = pStreamIdBase[2 * b + 1];
658 for (uint32_t i = 0; i < 4; ++i)
659 {
660 if ((curInputByte & 0x3) != stream)
661 {
662 outByte |= (1 << (i + 4));
663 }
664 curInputByte >>= 2;
665 }
666
667 *pCutBuffer++ = outByte;
668 }
669 }
670
671 THREAD SWR_GS_CONTEXT tlsGsContext;
672
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Implements GS stage.
675 /// @param pDC - pointer to draw context.
676 /// @param workerId - thread's worker id. Even thread has a unique id.
677 /// @param pa - The primitive assembly object.
678 /// @param pGsOut - output stream for GS
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    // Point the thread-local GS context at the caller-allocated output buffers.
    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Buffer layout: one simdvertex batch per SIMD_WIDTH emitted verts, per
    // input prim, per GS instance.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // Single stream: 1 cut bit per emitted vertex.
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // Multi-stream: 2 stream-ID bits per emitted vertex, dword-aligned.
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Run the GS once per declared instance; each invocation writes to its
    // own slice of the output and cut/stream-id buffers.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            // NOTE(review): emitted-vert count is indexed by inputPrim only,
            // not per GS instance — confirm this matches how the GS reports
            // vertexCount for instanced GS.
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // Shadows the function parameter intentionally: selects either
                // the GS-written cut buffer or the translated per-stream one.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // Only the stream selected for rasterization is clipped/binned.
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // Otherwise replicate the input prim's ID across lanes.
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
874
875 //////////////////////////////////////////////////////////////////////////
876 /// @brief Allocate GS buffers
877 /// @param pDC - pointer to draw context.
878 /// @param state - API state
879 /// @param ppGsOut - pointer to GS output buffer allocation
880 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
881 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
882 void **ppStreamCutBuffer)
883 {
884 auto pArena = pDC->pArena;
885 SWR_ASSERT(pArena != nullptr);
886 SWR_ASSERT(state.gsState.gsEnable);
887 // allocate arena space to hold GS output verts
888 // @todo pack attribs
889 // @todo support multiple streams
890 const uint32_t vertexStride = sizeof(simdvertex);
891 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
892 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
893 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
894
895 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
896 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
897 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
898 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
899
900 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
901 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
902
903 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
904 if (state.gsState.isSingleStream)
905 {
906 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
907 *ppStreamCutBuffer = nullptr;
908 }
909 else
910 {
911 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
912 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
913 }
914
915 }
916
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // input/output context for the hull shader
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control point output, one patch per SIMD lane
    void* pTxCtx;                           // tessellator context storage, lazily allocated on first use
    size_t tsCtxSize;                       // size in bytes of the tessellator context storage

    simdscalar* pDSOutput;                  // domain shader output buffer, grown on demand
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in simd vectors
};

// Per-worker-thread tessellation scratch data; allocated on first use by
// AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
932
933 //////////////////////////////////////////////////////////////////////////
934 /// @brief Allocate tessellation data for this worker thread.
935 INLINE
936 static void AllocateTessellationData(SWR_CONTEXT* pContext)
937 {
938 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
939 if (gt_pTessellationThreadData == nullptr)
940 {
941 gt_pTessellationThreadData = (TessellationThreadLocalData*)
942 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
943 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
944 }
945 }
946
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut buffer for the (optional) downstream GS stage
/// @param pCutStreamBuffer - per-stream cut buffer for multi-stream GS
/// @param pSoPrimData - scratch space for stream-out primitive data
/// @param primID - SIMD vector of primitive IDs for the input patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // TSInitCtx failed with the current thread-local storage (presumably
        // too small or not yet allocated); allocate tsCtxSize bytes and retry
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // pick the binner/clipper for the post-DS topology (only needed when rasterizing)
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison the HS output so stale reads are obvious in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // tessellate and domain-shade each patch (one per active SIMD lane)
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        if (tsData.NumPrimitives == 0)
        {
            // patch fully culled by the tessellator (e.g. zero tess factors)
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // grow-only buffer reused across patches/draws on this thread
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes past the last domain point in the final iteration
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);

        // assemble tessellator output into primitives and feed the rest of the FE
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1141
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index (primitive restart) enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // per-lane vertex index offsets 0..7
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // for indexed draws, compute the address just past the last index requested
    // (used below to clamp fetchInfo.pLastIndex)
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0; // vertices processed so far this instance

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // non-indexed: synthesize sequential indices starting at startVertexID
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance to the next SIMD's worth of vertices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1408
//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to resolve runtime boolean
///        draw options into a concrete ProcessDraw template instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1419
1420
// Selector for correct templated Draw front-end function; maps the runtime
// booleans onto the matching ProcessDraw instantiation via FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1432
1433 //////////////////////////////////////////////////////////////////////////
1434 /// @brief Processes attributes for the backend based on linkage mask and
1435 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1436 /// @param pDC - Draw context
1437 /// @param pa - Primitive Assembly state
1438 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1439 /// @param pLinkageMap - maps VS attribute slot to PS slot
1440 /// @param triIndex - Triangle to process attributes for
1441 /// @param pBuffer - Output result
1442 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1443 INLINE void ProcessAttributes(
1444 DRAW_CONTEXT *pDC,
1445 PA_STATE&pa,
1446 uint32_t triIndex,
1447 uint32_t primId,
1448 float *pBuffer)
1449 {
1450 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1451 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1452 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1453 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1454 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1455 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1456
1457 static const float constTable[3][4] = {
1458 {0.0f, 0.0f, 0.0f, 0.0f},
1459 {0.0f, 0.0f, 0.0f, 1.0f},
1460 {1.0f, 1.0f, 1.0f, 1.0f}
1461 };
1462
1463 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1464 {
1465 uint32_t inputSlot;
1466 if (IsSwizzledT::value)
1467 {
1468 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1469 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1470
1471 }
1472 else
1473 {
1474 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1475 }
1476
1477 __m128 attrib[3]; // triangle attribs (always 4 wide)
1478 float* pAttribStart = pBuffer;
1479
1480 if (HasConstantInterpT::value || IsDegenerate::value)
1481 {
1482 if (_bittest(&constantInterpMask, i))
1483 {
1484 uint32_t vid;
1485 uint32_t adjustedTriIndex;
1486 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1487 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1488 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1489 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1490 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1491
1492 switch (topo) {
1493 case TOP_QUAD_LIST:
1494 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1495 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1496 break;
1497 case TOP_QUAD_STRIP:
1498 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1499 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1500 break;
1501 case TOP_TRIANGLE_STRIP:
1502 adjustedTriIndex = triIndex;
1503 vid = (triIndex & 1)
1504 ? tristripProvokingVertex[provokingVertex]
1505 : provokingVertex;
1506 break;
1507 default:
1508 adjustedTriIndex = triIndex;
1509 vid = provokingVertex;
1510 break;
1511 }
1512
1513 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1514
1515 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1516 {
1517 _mm_store_ps(pBuffer, attrib[vid]);
1518 pBuffer += 4;
1519 }
1520 }
1521 else
1522 {
1523 pa.AssembleSingle(inputSlot, triIndex, attrib);
1524
1525 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1526 {
1527 _mm_store_ps(pBuffer, attrib[i]);
1528 pBuffer += 4;
1529 }
1530 }
1531 }
1532 else
1533 {
1534 pa.AssembleSingle(inputSlot, triIndex, attrib);
1535
1536 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1537 {
1538 _mm_store_ps(pBuffer, attrib[i]);
1539 pBuffer += 4;
1540 }
1541 }
1542
1543 // pad out the attrib buffer to 3 verts to ensure the triangle
1544 // interpolation code in the pixel shader works correctly for the
1545 // 3 topologies - point, line, tri. This effectively zeros out the
1546 // effect of the missing vertices in the triangle interpolation.
1547 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1548 {
1549 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1550 pBuffer += 4;
1551 }
1552
1553 // check for constant source overrides
1554 if (IsSwizzledT::value)
1555 {
1556 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1557 if (mask)
1558 {
1559 DWORD comp;
1560 while (_BitScanForward(&comp, mask))
1561 {
1562 mask &= ~(1 << comp);
1563
1564 float constantValue = 0.0f;
1565 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1566 {
1567 case SWR_CONSTANT_SOURCE_CONST_0000:
1568 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1569 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1570 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1571 break;
1572 case SWR_CONSTANT_SOURCE_PRIM_ID:
1573 constantValue = *(float*)&primId;
1574 break;
1575 }
1576
1577 // apply constant value to all 3 vertices
1578 for (uint32_t v = 0; v < 3; ++v)
1579 {
1580 pAttribStart[comp + v * 4] = constantValue;
1581 }
1582 }
1583 }
1584 }
1585 }
1586 }
1587
1588
// Function pointer type shared by all ProcessAttributes instantiations.
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to resolve runtime options
///        into a concrete ProcessAttributes template instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1601
// Selects the ProcessAttributes instantiation matching the given runtime
// options; NumVerts must be in [1, 3] (enforced by IntArg<1, 3>).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1606
1607 //////////////////////////////////////////////////////////////////////////
1608 /// @brief Processes enabled user clip distances. Loads the active clip
1609 /// distances from the PA, sets up barycentric equations, and
1610 /// stores the results to the output buffer
1611 /// @param pa - Primitive Assembly state
1612 /// @param primIndex - primitive index to process
1613 /// @param clipDistMask - mask of enabled clip distances
1614 /// @param pUserClipBuffer - buffer to store results
1615 template<uint32_t NumVerts>
1616 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1617 {
1618 DWORD clipDist;
1619 while (_BitScanForward(&clipDist, clipDistMask))
1620 {
1621 clipDistMask &= ~(1 << clipDist);
1622 uint32_t clipSlot = clipDist >> 2;
1623 uint32_t clipComp = clipDist & 0x3;
1624 uint32_t clipAttribSlot = clipSlot == 0 ?
1625 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1626
1627 __m128 primClipDist[3];
1628 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1629
1630 float vertClipDist[NumVerts];
1631 for (uint32_t e = 0; e < NumVerts; ++e)
1632 {
1633 OSALIGNSIMD(float) aVertClipDist[4];
1634 _mm_store_ps(aVertClipDist, primClipDist[e]);
1635 vertClipDist[e] = aVertClipDist[clipComp];
1636 };
1637
1638 // setup plane equations for barycentric interpolation in the backend
1639 float baryCoeff[NumVerts];
1640 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1641 {
1642 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1643 }
1644 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1645
1646 for (uint32_t e = 0; e < NumVerts; ++e)
1647 {
1648 *(pUserClipBuffer++) = baryCoeff[e];
1649 }
1650 }
1651 }
1652
1653 //////////////////////////////////////////////////////////////////////////
1654 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1655 /// Point precision from FP32.
1656 template <typename PT = FixedPointTraits<Fixed_16_8>>
1657 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1658 {
1659 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1660 return _simd_cvtps_epi32(vFixed);
1661 }
1662
1663 //////////////////////////////////////////////////////////////////////////
1664 /// @brief Helper function to set the X,Y coords of a triangle to the
1665 /// requested Fixed Point precision from FP32.
1666 /// @param tri: simdvector[3] of FP triangle verts
1667 /// @param vXi: fixed point X coords of tri verts
1668 /// @param vYi: fixed point Y coords of tri verts
1669 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1670 {
1671 vXi[0] = fpToFixedPointVertical(tri[0].x);
1672 vYi[0] = fpToFixedPointVertical(tri[0].y);
1673 vXi[1] = fpToFixedPointVertical(tri[1].x);
1674 vYi[1] = fpToFixedPointVertical(tri[1].y);
1675 vXi[2] = fpToFixedPointVertical(tri[2].x);
1676 vYi[2] = fpToFixedPointVertical(tri[2].y);
1677 }
1678
1679 //////////////////////////////////////////////////////////////////////////
1680 /// @brief Calculate bounding box for current triangle
1681 /// @tparam CT: ConservativeRastFETraits type
1682 /// @param vX: fixed point X position for triangle verts
1683 /// @param vY: fixed point Y position for triangle verts
1684 /// @param bbox: fixed point bbox
1685 /// *Note*: expects vX, vY to be in the correct precision for the type
1686 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1687 template <typename CT>
1688 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1689 {
1690 simdscalari vMinX = vX[0];
1691 vMinX = _simd_min_epi32(vMinX, vX[1]);
1692 vMinX = _simd_min_epi32(vMinX, vX[2]);
1693
1694 simdscalari vMaxX = vX[0];
1695 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1696 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1697
1698 simdscalari vMinY = vY[0];
1699 vMinY = _simd_min_epi32(vMinY, vY[1]);
1700 vMinY = _simd_min_epi32(vMinY, vY[2]);
1701
1702 simdscalari vMaxY = vY[0];
1703 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1704 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1705
1706 bbox.left = vMinX;
1707 bbox.right = vMaxX;
1708 bbox.top = vMinY;
1709 bbox.bottom = vMaxY;
1710 }
1711
1712 //////////////////////////////////////////////////////////////////////////
1713 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1714 /// Offsets BBox for conservative rast
1715 template <>
1716 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1717 {
1718 // FE conservative rast traits
1719 typedef FEConservativeRastT CT;
1720
1721 simdscalari vMinX = vX[0];
1722 vMinX = _simd_min_epi32(vMinX, vX[1]);
1723 vMinX = _simd_min_epi32(vMinX, vX[2]);
1724
1725 simdscalari vMaxX = vX[0];
1726 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1727 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1728
1729 simdscalari vMinY = vY[0];
1730 vMinY = _simd_min_epi32(vMinY, vY[1]);
1731 vMinY = _simd_min_epi32(vMinY, vY[2]);
1732
1733 simdscalari vMaxY = vY[0];
1734 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1735 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1736
1737 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1738 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1739 bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1740 bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1741 bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1742 bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1743 }
1744
1745 //////////////////////////////////////////////////////////////////////////
1746 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1747 /// culling, viewport transform, etc.
1748 /// @param pDC - pointer to draw context.
1749 /// @param pa - The primitive assembly object.
1750 /// @param workerId - thread's worker id. Even thread has a unique id.
1751 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1752 /// @param primID - Primitive ID for each triangle.
1753 /// @tparam CT - ConservativeRastFETraits
1754 template <typename CT>
1755 void BinTriangles(
1756 DRAW_CONTEXT *pDC,
1757 PA_STATE& pa,
1758 uint32_t workerId,
1759 simdvector tri[3],
1760 uint32_t triMask,
1761 simdscalari primID)
1762 {
1763 RDTSC_START(FEBinTriangles);
1764
1765 const API_STATE& state = GetApiState(pDC);
1766 const SWR_RASTSTATE& rastState = state.rastState;
1767 const SWR_FRONTEND_STATE& feState = state.frontendState;
1768 const SWR_GS_STATE& gsState = state.gsState;
1769 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1770
1771
1772 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1773 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1774 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1775
1776 if (!feState.vpTransformDisable)
1777 {
1778 // perspective divide
1779 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1780 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1781 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1782
1783 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1784 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1785 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1786
1787 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1788 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1789 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1790
1791 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1792 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1793 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1794
1795 // viewport transform to screen coords
1796 viewportTransform<3>(tri, state.vpMatrix[0]);
1797 }
1798
1799 // adjust for pixel center location
1800 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1801 tri[0].x = _simd_add_ps(tri[0].x, offset);
1802 tri[0].y = _simd_add_ps(tri[0].y, offset);
1803
1804 tri[1].x = _simd_add_ps(tri[1].x, offset);
1805 tri[1].y = _simd_add_ps(tri[1].y, offset);
1806
1807 tri[2].x = _simd_add_ps(tri[2].x, offset);
1808 tri[2].y = _simd_add_ps(tri[2].y, offset);
1809
1810 simdscalari vXi[3], vYi[3];
1811 // Set vXi, vYi to required fixed point precision
1812 FPToFixedPoint(tri, vXi, vYi);
1813
1814 // triangle setup
1815 simdscalari vAi[3], vBi[3];
1816 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1817
1818 // determinant
1819 simdscalari vDet[2];
1820 calcDeterminantIntVertical(vAi, vBi, vDet);
1821
1822 // cull zero area
1823 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1824 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1825
1826 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1827
1828 uint32_t origTriMask = triMask;
1829 // don't cull degenerate triangles if we're conservatively rasterizing
1830 if(!CT::IsConservativeT::value)
1831 {
1832 triMask &= ~cullZeroAreaMask;
1833 }
1834
1835 // determine front winding tris
1836 // CW +det
1837 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1838 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1839 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1840 int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1841
1842 uint32_t frontWindingTris;
1843 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1844 {
1845 frontWindingTris = cwTriMask;
1846 }
1847 else
1848 {
1849 frontWindingTris = ~cwTriMask;
1850 }
1851
1852 // cull
1853 uint32_t cullTris;
1854 switch ((SWR_CULLMODE)rastState.cullMode)
1855 {
1856 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1857 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1858 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1859 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1860 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1861 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1862 }
1863
1864 triMask &= ~cullTris;
1865
1866 if (origTriMask ^ triMask)
1867 {
1868 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1869 }
1870
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
1872 // compute per tri backface
1873 uint32_t frontFaceMask = frontWindingTris;
1874 uint32_t *pPrimID = (uint32_t *)&primID;
1875 DWORD triIndex = 0;
1876 // for center sample pattern, all samples are at pixel center; calculate coverage
1877 // once at center and broadcast the results in the backend
1878 const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1879 uint32_t edgeEnable;
1880 PFN_WORK_FUNC pfnWork;
1881 if(CT::IsConservativeT::value)
1882 {
1883 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1884 // used to call the appropriate templated rasterizer function
1885 if(cullZeroAreaMask > 0)
1886 {
1887 // e0 = v1-v0
1888 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
1889 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
1890 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
1891
1892 // e1 = v2-v1
1893 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
1894 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
1895 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
1896
1897 // e2 = v0-v2
1898 // if v0 == v1 & v1 == v2, v0 == v2
1899 uint32_t e2Mask = e0Mask & e1Mask;
1900 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1901
1902 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1903 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1904 e0Mask = pdep_u32(e0Mask, 0x00249249);
1905 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1906 e1Mask = pdep_u32(e1Mask, 0x00492492);
1907 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1908 e2Mask = pdep_u32(e2Mask, 0x00924924);
1909
1910 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1911 }
1912 else
1913 {
1914 edgeEnable = 0x00FFFFFF;
1915 }
1916 }
1917 else
1918 {
1919 // degenerate triangles won't be sent to rasterizer; just enable all edges
1920 pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1921 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
1922 (rastState.scissorEnable > 0));
1923 }
1924
1925 if (!triMask)
1926 {
1927 goto endBinTriangles;
1928 }
1929
1930 // Calc bounding box of triangles
1931 simdBBox bbox;
1932 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1933
1934 // determine if triangle falls between pixel centers and discard
1935 // only discard for non-MSAA case and when conservative rast is disabled
1936 // (left + 127) & ~255
1937 // (right + 128) & ~255
1938 if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1939 {
1940 origTriMask = triMask;
1941
1942 int cullCenterMask;
1943 {
1944 simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1945 left = _simd_and_si(left, _simd_set1_epi32(~255));
1946 simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1947 right = _simd_and_si(right, _simd_set1_epi32(~255));
1948
1949 simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1950
1951 simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1952 top = _simd_and_si(top, _simd_set1_epi32(~255));
1953 simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1954 bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1955
1956 simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1957 vMaskV = _simd_or_si(vMaskH, vMaskV);
1958 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1959 }
1960
1961 triMask &= ~cullCenterMask;
1962
1963 if(origTriMask ^ triMask)
1964 {
1965 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1966 }
1967 }
1968
1969 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1970 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1971 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1972 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1973 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1974
1975 if(CT::IsConservativeT::value)
1976 {
1977 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1978 // some area. Bump the right/bottom edges out
1979 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
1980 bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
1981 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
1982 bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
1983 }
1984
1985 // Cull tris completely outside scissor
1986 {
1987 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1988 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1989 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1990 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1991 triMask = triMask & ~maskOutsideScissor;
1992 }
1993
1994 if (!triMask)
1995 {
1996 goto endBinTriangles;
1997 }
1998
1999 // Convert triangle bbox to macrotile units.
2000 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2001 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2002 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2003 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2004
2005 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2006 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2007 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2008 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2009 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2010
2011 // transpose verts needed for backend
2012 /// @todo modify BE to take non-transformed verts
2013 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2014 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
2015 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
2016 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
2017 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
2018
2019 // store render target array index
2020 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2021 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2022 {
2023 simdvector vRtai[3];
2024 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2025 simdscalari vRtaii;
2026 vRtaii = _simd_castps_si(vRtai[0].x);
2027 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2028 }
2029 else
2030 {
2031 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2032 }
2033
2034 // scan remaining valid triangles and bin each separately
2035 while (_BitScanForward(&triIndex, triMask))
2036 {
2037 uint32_t linkageCount = state.backendState.numAttributes;
2038 uint32_t numScalarAttribs = linkageCount * 4;
2039
2040 BE_WORK work;
2041 work.type = DRAW;
2042
2043 bool isDegenerate;
2044 if(CT::IsConservativeT::value)
2045 {
2046 // only rasterize valid edges if we have a degenerate primitive
2047 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
2048 work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
2049 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
2050 (rastState.scissorEnable > 0));
2051
2052 // Degenerate triangles are required to be constant interpolated
2053 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
2054 }
2055 else
2056 {
2057 isDegenerate = false;
2058 work.pfnWork = pfnWork;
2059 }
2060
2061 // Select attribute processor
2062 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
2063 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
2064
2065 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2066
2067 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
2068 desc.triFlags.primID = pPrimID[triIndex];
2069 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
2070
2071 auto pArena = pDC->pArena;
2072 SWR_ASSERT(pArena != nullptr);
2073
2074 // store active attribs
2075 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2076 desc.pAttribs = pAttribs;
2077 desc.numAttribs = linkageCount;
2078 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
2079
2080 // store triangle vertex data
2081 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2082
2083 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
2084 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
2085 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
2086 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
2087
2088 // store user clip distances
2089 if (rastState.clipDistanceMask)
2090 {
2091 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2092 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2093 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2094 }
2095
2096 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
2097 {
2098 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
2099 {
2100 #if KNOB_ENABLE_TOSS_POINTS
2101 if (!KNOB_TOSS_SETUP_TRIS)
2102 #endif
2103 {
2104 pTileMgr->enqueue(x, y, &work);
2105 }
2106 }
2107 }
2108 triMask &= ~(1 << triIndex);
2109 }
2110
2111 endBinTriangles:
2112 RDTSC_STOP(FEBinTriangles, 1, 0);
2113 }
2114
2115 struct FEBinTrianglesChooser
2116 {
2117 typedef PFN_PROCESS_PRIMS FuncType;
2118
2119 template <typename... ArgsB>
2120 static FuncType GetFunc()
2121 {
2122 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2123 }
2124 };
2125
2126 // Selector for correct templated BinTrinagles function
2127 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2128 {
2129 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2130 }
2131
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Only supports point size of 1
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primMask - Bitmask of valid SIMD lanes; one bit per point.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // Points only use the first vertex slot of the primitive.
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    // Select attribute processor (points have 1 vertex per primitive)
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
    }

    // adjust for pixel center location selected by the API
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert screen coords to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // Fast path: each point covers a single pixel, so it lands in exactly
        // one raster tile of one macrotile and can be rasterized directly.

        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport; after the
        // adjustment above such points have a negative coordinate, and
        // movemask harvests the sign bits into a lane mask.
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        // pixel position relative to its raster tile origin
        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        // perspective-correct z per point, spilled for scalar access below
        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            // point work is described with the triangle work descriptor
            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            // note: x,y are written as raw uint32 bits into the float buffer
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size from the point size attribute slot
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start from the degenerate point bbox and grow
        // it by half the point size in each direction
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            // pointSize travels in the tri flags for the point rasterizer
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data (screen-space x, y and z)
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the work to every macrotile the bloated point touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    RDTSC_STOP(FEBinPoints, 1, 0);
}
2441
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines.
/// @param primMask - Bitmask of valid SIMD lanes; one bit per line.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // Select attribute processor (lines have 2 vertices per primitive)
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    // default 1/w when viewport transform is disabled (no perspective divide)
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrix[0]);
    }

    // adjust for pixel center location selected by the API
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert to fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask; a line is y-major when its vertical
    // extent exceeds its horizontal extent
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (zero extent on both axes)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    uint32_t *pPrimID = (uint32_t *)&primID;

    // filler value used as the 3rd "vertex" in the backend transpose below
    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // take the bloated horizontal extent for y-major lines and the bloated
    // vertical extent for x-major lines
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        goto endBinLines;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        // line work is described with the triangle work descriptor
        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

        // store line vertex data (transposed x, y, z, 1/w per vertex)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the work to every macrotile the line's bbox touches
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}