swr: [rasterizer core] TemplateArgUnroller
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "utils.h"
37 #include "threads.h"
38 #include "pa.h"
39 #include "clip.h"
40 #include "tilemgr.h"
41 #include "tessellator.h"
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief Offsets added to post-viewport vertex positions based on
53 /// raster state.
// Lookup table indexed by SWR_PIXEL_LOCATION; each entry is the offset
// splatted across all SIMD lanes and added to post-viewport positions.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
59
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief FE handler for SwrSync.
62 /// @param pContext - pointer to SWR context.
63 /// @param pDC - pointer to draw context.
64 /// @param workerId - thread's worker id. Even thread has a unique id.
65 /// @param pUserData - Pointer to user data passed back to sync callback.
66 /// @todo This should go away when we switch this to use compute threading.
67 void ProcessSync(
68 SWR_CONTEXT *pContext,
69 DRAW_CONTEXT *pDC,
70 uint32_t workerId,
71 void *pUserData)
72 {
73 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
74 BE_WORK work;
75 work.type = SYNC;
76 work.pfnWork = ProcessSyncBE;
77 work.desc.sync = *pSync;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrGetStats.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to stats callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessQueryStats(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
97 BE_WORK work;
98 work.type = QUERYSTATS;
99 work.pfnWork = ProcessQueryStatsBE;
100 work.desc.queryStats = *pQueryStats;
101
102 MacroTileMgr *pTileMgr = pDC->pTileMgr;
103 pTileMgr->enqueue(0, 0, &work);
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief FE handler for SwrClearRenderTarget.
108 /// @param pContext - pointer to SWR context.
109 /// @param pDC - pointer to draw context.
110 /// @param workerId - thread's worker id. Even thread has a unique id.
111 /// @param pUserData - Pointer to user data passed back to clear callback.
112 /// @todo This should go away when we switch this to use compute threading.
113 void ProcessClear(
114 SWR_CONTEXT *pContext,
115 DRAW_CONTEXT *pDC,
116 uint32_t workerId,
117 void *pUserData)
118 {
119 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
120 MacroTileMgr *pTileMgr = pDC->pTileMgr;
121
122 const API_STATE& state = GetApiState(pDC);
123
124 // queue a clear to each macro tile
125 // compute macro tile bounds for the current scissor/viewport
126 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
127 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
128 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
129 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
130
131 BE_WORK work;
132 work.type = CLEAR;
133 work.pfnWork = ProcessClearBE;
134 work.desc.clear = *pClear;
135
136 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
137 {
138 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
139 {
140 pTileMgr->enqueue(x, y, &work);
141 }
142 }
143 }
144
145 //////////////////////////////////////////////////////////////////////////
146 /// @brief FE handler for SwrStoreTiles.
147 /// @param pContext - pointer to SWR context.
148 /// @param pDC - pointer to draw context.
149 /// @param workerId - thread's worker id. Even thread has a unique id.
150 /// @param pUserData - Pointer to user data passed back to callback.
151 /// @todo This should go away when we switch this to use compute threading.
152 void ProcessStoreTiles(
153 SWR_CONTEXT *pContext,
154 DRAW_CONTEXT *pDC,
155 uint32_t workerId,
156 void *pUserData)
157 {
158 RDTSC_START(FEProcessStoreTiles);
159 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
160 MacroTileMgr *pTileMgr = pDC->pTileMgr;
161
162 const API_STATE& state = GetApiState(pDC);
163
164 // queue a store to each macro tile
165 // compute macro tile bounds for the current render target
166 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
167 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
168
169 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
170 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
171
172 // store tiles
173 BE_WORK work;
174 work.type = STORETILES;
175 work.pfnWork = ProcessStoreTileBE;
176 work.desc.storeTiles = *pStore;
177
178 for (uint32_t x = 0; x < numMacroTilesX; ++x)
179 {
180 for (uint32_t y = 0; y < numMacroTilesY; ++y)
181 {
182 pTileMgr->enqueue(x, y, &work);
183 }
184 }
185
186 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
187 }
188
189 //////////////////////////////////////////////////////////////////////////
190 /// @brief FE handler for SwrInvalidateTiles.
191 /// @param pContext - pointer to SWR context.
192 /// @param pDC - pointer to draw context.
193 /// @param workerId - thread's worker id. Even thread has a unique id.
194 /// @param pUserData - Pointer to user data passed back to callback.
195 /// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect in the descriptor means "no rect provided";
    // any nonzero edge selects the caller-supplied rect.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles only: round the start edges UP and
    // the end edges DOWN so partially-covered tiles are excluded.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round start edges DOWN and end edges UP
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    // The end tile is clamped to the hot-tile array bounds; the asserts
    // flag out-of-range rects in debug builds, the clamp keeps release
    // builds from enqueueing to nonexistent tiles.
    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // Queue one discard/invalidate work item per covered macrotile.
    // Note the end bounds are exclusive here, unlike ProcessClear.
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
268
269 //////////////////////////////////////////////////////////////////////////
270 /// @brief Computes the number of primitives given the number of verts.
271 /// @param mode - primitive topology for draw operation.
272 /// @param numPrims - number of vertices or indices for draw.
273 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    // Note: despite the parameter name, numPrims here is a VERTEX (or index)
    // count; the return value is the resulting primitive count.
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;  // each rect is specified with 3 verts
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // Patch lists: control-point count per patch is encoded as the offset
    // from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    // Topologies the frontend does not support: assert in debug, 0 prims.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
345
346 //////////////////////////////////////////////////////////////////////////
347 /// @brief Computes the number of verts given the number of primitives.
348 /// @param mode - primitive topology for draw operation.
349 /// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    // Inverse of GetNumPrims: given a primitive count, return the number of
    // vertices required to specify them for the given topology.
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;  // each rect is specified with 3 verts
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // Patch lists: control-point count per patch is encoded as the offset
    // from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    // Topologies the frontend does not support: assert in debug, 0 verts.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
421
422 //////////////////////////////////////////////////////////////////////////
423 /// @brief Return number of verts per primitive.
424 /// @param topology - topology
425 /// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    // Number of vertices that make up one assembled primitive of the given
    // topology (e.g. 3 for anything triangle-based, 2 for lines).
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    // Patch lists: control-point count per patch is encoded as the offset
    // from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_ASSERT(false, "Unsupported topology: %d", topology);
        break;
    }

    // When requested, count the adjacent vertices carried alongside
    // the primitive (consumed by the geometry shader).
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
510
511 //////////////////////////////////////////////////////////////////////////
512 /// @brief Generate mask from remaining work.
513 /// @param numWorkItems - Number of items being worked on by a SIMD.
514 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
515 {
516 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
517 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
518 return _simd_castps_si(vMask(mask));
519 }
520
521 //////////////////////////////////////////////////////////////////////////
522 /// @brief StreamOut - Streams vertex data out to SO buffers.
523 /// Generally, we are only streaming out a SIMDs worth of triangles.
524 /// @param pDC - pointer to draw context.
525 /// @param workerId - thread's worker id. Even thread has a unique id.
526 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // SO writes whole primitives, without adjacency verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    // For each assembled primitive: gather the enabled attributes into the
    // sparse pPrimData staging buffer, then invoke the jitted SO shader.
    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // Bitmask of attribute slots enabled for this stream.
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Dword offset of this attribute from the start of a vertex in
            // pPrimData (4 floats per attribute slot).
            // NOTE(review): an older comment here described a (slot - 1)
            // adjustment, but the code indexes by slot directly — the staging
            // buffer is sparse over all attribute slots.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot so _BitScanForward advances.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);

            // The SOS increments the existing write offset. So we don't want to increment
            // the SoWriteOffset stat using an absolute offset instead of relative.
            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
        }
    }

    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
609
610 //////////////////////////////////////////////////////////////////////////
611 /// @brief Computes number of invocations. The current index represents
612 /// the start of the SIMD. The max index represents how much work
613 /// items are remaining. If there is less then a SIMD's left of work
614 /// then return the remaining amount of work.
615 /// @param curIndex - The start index for the SIMD.
616 /// @param maxIndex - The last index for all work items.
617 static INLINE uint32_t GetNumInvocations(
618 uint32_t curIndex,
619 uint32_t maxIndex)
620 {
621 uint32_t remainder = (maxIndex - curIndex);
622 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
623 }
624
625 //////////////////////////////////////////////////////////////////////////
626 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
627 /// The geometry shader will loop over each active streamout buffer, assembling
628 /// primitives for the downstream stages. When multistream output is enabled,
629 /// the generated stream ID buffer from the GS needs to be converted to a cut
630 /// buffer for the primitive assembler.
631 /// @param stream - stream id to generate the cut buffer for
632 /// @param pStreamIdBase - pointer to the stream ID buffer
633 /// @param numEmittedVerts - Number of total verts emitted by the GS
634 /// @param pCutBuffer - output buffer to write cuts to
635 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
636 {
637 SWR_ASSERT(stream < MAX_SO_STREAMS);
638
639 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
640 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
641
642 for (uint32_t b = 0; b < numOutputBytes; ++b)
643 {
644 uint8_t curInputByte = pStreamIdBase[2*b];
645 uint8_t outByte = 0;
646 for (uint32_t i = 0; i < 4; ++i)
647 {
648 if ((curInputByte & 0x3) != stream)
649 {
650 outByte |= (1 << i);
651 }
652 curInputByte >>= 2;
653 }
654
655 curInputByte = pStreamIdBase[2 * b + 1];
656 for (uint32_t i = 0; i < 4; ++i)
657 {
658 if ((curInputByte & 0x3) != stream)
659 {
660 outByte |= (1 << (i + 4));
661 }
662 curInputByte >>= 2;
663 }
664
665 *pCutBuffer++ = outByte;
666 }
667 }
668
669 THREAD SWR_GS_CONTEXT tlsGsContext;
670
671 //////////////////////////////////////////////////////////////////////////
672 /// @brief Implements GS stage.
673 /// @param pDC - pointer to draw context.
674 /// @param workerId - thread's worker id. Even thread has a unique id.
675 /// @param pa - The primitive assembly object.
676 /// @param pGsOut - output stream for GS
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    // Point the thread-local GS context at the per-draw output buffers.
    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // GS input includes adjacency verts where the topology carries them.
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Output buffer layout: per input prim, per GS instance. These strides
    // must match the allocation math in AllocateGsBuffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // Single stream: 1 cut bit per vertex. Multi-stream: 2 stream ID bits
    // per vertex, translated to cut bits later via ProcessStreamIdBuffer.
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Execute the GS once per instance; each execution consumes the same
    // input prims and appends to its own slice of the output buffers.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // NOTE(review): tlsGsContext.vertexCount is whatever the LAST instance's
    // GS execution left behind, but it is read for every instance below —
    // this assumes all instances emit the same vertex count per prim; confirm.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            // Attribute count = index of the highest bit set in the FE
            // attribute mask, plus one; zero if no attributes are enabled.
            DWORD numAttribs;
            if (_BitScanReverse(&numAttribs, state.feAttribMask))
            {
                numAttribs++;
            }
            else
            {
                numAttribs = 0;
            }

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: shadows the function parameter pCutBuffer for the
                // remainder of this stream iteration.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // Only one stream feeds the rasterizer.
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // Fall back to the input prim's ID for
                                    // every generated prim.
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
880
881 //////////////////////////////////////////////////////////////////////////
882 /// @brief Allocate GS buffers
883 /// @param pDC - pointer to draw context.
884 /// @param state - API state
885 /// @param ppGsOut - pointer to GS output buffer allocation
886 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    // These strides must match the layout computed in GeometryShaderStage:
    // [instance][input prim (SIMD lane)][simd batch of verts].
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // Cut buffer: 1 bit per vertex. StreamID buffer: 2 bits per vertex,
    // aligned up to 4 bytes (mirrors cutPrimStride in GeometryShaderStage).
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        // Multi-stream: GS writes stream IDs, and a scratch cut buffer is
        // needed for the per-stream translation (ProcessStreamIdBuffer).
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
922
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // hull shader input/output context
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output, one patch per SIMD lane
    void* pTxCtx;                           // backing storage for the tessellator context (TSInitCtx)
    size_t tsCtxSize;                       // size in bytes of pTxCtx

    simdscalar* pDSOutput;                  // domain shader output buffer, grown on demand
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in output vectors
};

// Per-worker-thread tessellation scratch data; lazily allocated by
// AllocateTessellationData and reused across draws.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
938
939 //////////////////////////////////////////////////////////////////////////
940 /// @brief Allocate tessellation data for this worker thread.
941 INLINE
942 static void AllocateTessellationData(SWR_CONTEXT* pContext)
943 {
944 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
945 if (gt_pTessellationThreadData == nullptr)
946 {
947 gt_pTessellationThreadData = (TessellationThreadLocalData*)
948 _aligned_malloc(sizeof(TessellationThreadLocalData), 64);
949 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
950 }
951 }
952
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut buffer (single-stream) or streamID buffer
/// @param pCutStreamBuffer - temporary per-stream GS cut buffer (multi-stream)
/// @param pSoPrimData - scratch space for stream-out primitive data
/// @param primID - SIMD of input primitive IDs, one per patch lane
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator context in the per-thread scratch buffer.
    // On the first call TSInitCtx returns nullptr (presumably reporting the
    // required size via tsCtxSize -- confirm against the tessellator API),
    // so allocate the backing storage and retry once.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Select the binner matching the post-DS output topology, only needed
    // when rasterization is enabled for this pipeline configuration.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison the HS output so stale control-point reads stand out in debug.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate, domain-shade, and emit each patch (SIMD lane) separately.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        // Patch fully culled by the tessellator (e.g. zero tess factors).
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory, growing the cached per-thread buffer
        // only when the current patch needs more than previously allocated.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        // NOTE(review): allocation is sized in simdvector units while the
        // buffer is addressed as simdscalar* -- presumably each DS output
        // attrib spans a full 4-component simdvector; confirm vs. DS backend.
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            _aligned_free(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all domain points, one SIMD batch at a time.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // Mask off lanes past the final domain point in the last batch.
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);

        // Assemble the tessellator's output primitives from DS results and
        // feed them to the downstream GS / stream-out / binning stages.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1147
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // Lane offsets 0..7 used to synthesize sequential vertex indices for
    // non-indexed draws.
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last index the
    // draw will consume, for each supported index format.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    // Only referenced by RDTSC_STOP below; presumably compiles out with
    // the rest of the RDTSC machinery when the knob is disabled.
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // Allocate GS scratch buffers up front when the GS stage is enabled.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);

        // update the SO write-offset stats from the current streamout
        // buffer offsets
        for (uint32_t i = 0; i < 4; ++i)
        {
            SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
        }

    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // Non-indexed: fabricate sequential indices starting at the
            // draw's start vertex ID.
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT(IaPrimitives, pa.NumPrims());

                            // Dispatch to the first enabled downstream stage;
                            // each stage forwards to the ones after it.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // Advance the fetch cursor by one SIMD's worth of vertices.
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1420
1421 struct FEDrawChooser
1422 {
1423 typedef PFN_FE_WORK_FUNC FuncType;
1424
1425 template <typename... ArgsB>
1426 static FuncType GetFunc()
1427 {
1428 return ProcessDraw<ArgsB...>;
1429 }
1430 };
1431
1432
// Selector for correct templated Draw front-end function: unrolls the
// runtime bools into the matching ProcessDraw template instantiation
// via TemplateArgUnroller / FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1443
1444
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend based on linkage mask and
///        linkage map. Essentially just doing an SOA->AOS conversion and pack.
/// @param pDC - Draw context
/// @param pa - Primitive Assembly state
/// @param linkageMask - Specifies which VS outputs are routed to PS.
/// @param pLinkageMap - maps VS attribute slot to PS slot
/// @param triIndex - Triangle to process attributes for
/// @param pBuffer - Output result; receives 3 verts x 4 floats per linked
///        attribute, in linkage-mask bit order
template<uint32_t NumVerts>
INLINE void ProcessAttributes(
    DRAW_CONTEXT *pDC,
    PA_STATE&pa,
    uint32_t linkageMask,
    const uint8_t* pLinkageMap,
    uint32_t triIndex,
    float *pBuffer)
{
    DWORD slot = 0;
    uint32_t mapIdx = 0;
    LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;

    // Walk the set bits of linkageMask, one linked attribute per bit.
    while (_BitScanForward(&slot, linkageMask))
    {
        linkageMask &= ~(1 << slot); // done with this bit.

        // compute absolute slot in vertex attrib array
        uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];

        __m128 attrib[3]; // triangle attribs (always 4 wide)
        pa.AssembleSingle(inputSlot, triIndex, attrib);

        if (_bittest(&constantInterpMask, mapIdx))
        {
            // Flat-shaded attribute: replicate the provoking vertex's value
            // to all verts.
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[provokingVertex]);
                pBuffer += 4;
            }
        }
        else
        {
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[i]);
                pBuffer += 4;
            }
        }

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t i = NumVerts; i < 3; ++i)
        {
            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
            pBuffer += 4;
        }

        mapIdx++;
    }
}
1508
1509 //////////////////////////////////////////////////////////////////////////
1510 /// @brief Processes enabled user clip distances. Loads the active clip
1511 /// distances from the PA, sets up barycentric equations, and
1512 /// stores the results to the output buffer
1513 /// @param pa - Primitive Assembly state
1514 /// @param primIndex - primitive index to process
1515 /// @param clipDistMask - mask of enabled clip distances
1516 /// @param pUserClipBuffer - buffer to store results
1517 template<uint32_t NumVerts>
1518 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1519 {
1520 DWORD clipDist;
1521 while (_BitScanForward(&clipDist, clipDistMask))
1522 {
1523 clipDistMask &= ~(1 << clipDist);
1524 uint32_t clipSlot = clipDist >> 2;
1525 uint32_t clipComp = clipDist & 0x3;
1526 uint32_t clipAttribSlot = clipSlot == 0 ?
1527 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1528
1529 __m128 primClipDist[3];
1530 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1531
1532 float vertClipDist[NumVerts];
1533 for (uint32_t e = 0; e < NumVerts; ++e)
1534 {
1535 OSALIGNSIMD(float) aVertClipDist[4];
1536 _mm_store_ps(aVertClipDist, primClipDist[e]);
1537 vertClipDist[e] = aVertClipDist[clipComp];
1538 };
1539
1540 // setup plane equations for barycentric interpolation in the backend
1541 float baryCoeff[NumVerts];
1542 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1543 {
1544 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1545 }
1546 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1547
1548 for (uint32_t e = 0; e < NumVerts; ++e)
1549 {
1550 *(pUserClipBuffer++) = baryCoeff[e];
1551 }
1552 }
1553 }
1554
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param triMask - mask of valid triangle lanes in tri/primID
/// @param primID - Primitive ID for each triangle.
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID)
{
    RDTSC_START(FEBinTriangles);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // Simple wireframe mode for debugging purposes only

    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // viewport transform to screen coords
        viewportTransform<3>(tri, state.vpMatrix[0]);
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    // convert to fixed point
    simdscalari vXi[3], vYi[3];
    vXi[0] = fpToFixedPointVertical(tri[0].x);
    vYi[0] = fpToFixedPointVertical(tri[0].y);
    vXi[1] = fpToFixedPointVertical(tri[1].x);
    vYi[1] = fpToFixedPointVertical(tri[1].y);
    vXi[2] = fpToFixedPointVertical(tri[2].x);
    vYi[2] = fpToFixedPointVertical(tri[2].y);

    // triangle setup
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area: the 64-bit determinant compares produce one mask bit
    // per double lane, so low and high halves are combined below.
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    triMask &= ~cullZeroAreaMask;

    // determine front winding tris
    // CW +det
    // CCW -det
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE: cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;

    uint32_t *pPrimID = (uint32_t *)&primID;
    DWORD triIndex = 0;

    // everything culled -- nothing left to bin
    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical(vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case
    // (left + 127) & ~255
    // (right + 128) & ~255

    if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
    {
        origTriMask = triMask;

        // A triangle whose bbox rounds to the same pixel-center column
        // (or row) on both sides covers no sample points -- cull it.
        int cullCenterMask;
        {
            simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
            left = _simd_and_si(left, _simd_set1_epi32(~255));
            simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
            right = _simd_and_si(right, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(left, right);

            simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
            top = _simd_and_si(top, _simd_set1_epi32(~255));
            simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
            bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // Spill per-triangle macrotile extents to scalar arrays for the
    // per-triangle binning loop below.
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];

        if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
        {
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
        }
        else
        {
            // for center sample pattern, all samples are at pixel center; calculate coverage
            // once at center and broadcast the results in the backend
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
        }

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);

        // store triangle vertex data (x[3], y[3], z[3], 1/w[3] -- one
        // 4-float row per component from the transpose above)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the triangle into every macrotile its bbox touches
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    RDTSC_STOP(FEBinTriangles, 1, 0);
}
1857
1858
1859
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend.
///        Takes two paths: "simple" points (1-pixel, rasterized directly
///        from packed tile-relative coords) and general points (bloated
///        to a bbox by point size and binned to every overlapped macrotile).
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points
///               (only prim[0] is used; points have a single vertex).
/// @param primMask - bitmask of valid SIMD lanes; cleared lanes are culled.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // points only have one vertex; alias it for readability
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
    }

    // adjust for pixel center location (UL corner vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert screen-space positions to x.8 fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport: after the
        // adjustment above a negative coord has its sign bit set, which
        // movemask gathers per lane
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        // pixel coord minus tile origin = position within the raster tile
        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            // NOTE(review): other call sites in this file pass a 2-element
            // simdvector array to Assemble(); a single element appears
            // sufficient here since points assemble one vertex — confirm
            // against PA_STATE::Assemble's contract.
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;
        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;

            // each linked attribute is a 4-component vector
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes (3 verts worth; BE consumes triangle layout)
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            // (y in the upper nibble — assumes tile-relative coords fit in
            //  4 bits each; TODO confirm tile dims guarantee this)
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it: a 1-pixel point touches exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size from the point-size vertex slot
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start with a degenerate box at the point
        // center, then expand by half the point size in all directions
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor
        // (box inverted by the clamp above => fully clipped away)
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        // spill transformed positions to scalar arrays for per-prim access
        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

            // store point vertex data: center x, y, z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // bin to every macrotile the bloated bbox overlaps
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }




    RDTSC_STOP(FEBinPoints, 1, 0);
}
2164
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
///        Transforms both endpoints, computes a per-lane bbox bloated by
///        line width along the minor axis only, culls zero-length and
///        fully-scissored lines, then bins each surviving line to all
///        macrotiles its bbox overlaps.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines
///               (prim[0] = start vertices, prim[1] = end vertices).
/// @param primMask - bitmask of valid SIMD lanes; cleared lanes are culled.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // default 1/w when viewport transform is disabled (verts already in
    // screen space, no perspective divide needed)
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrix[0]);
    }

    // adjust for pixel center location (UL corner vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert both endpoints to x.8 fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask (y-major when |dy| > |dx|)
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (both dx and dy are zero)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    uint32_t *pPrimID = (uint32_t *)&primID;

    // placeholder third vertex for the backend's triangle-shaped layout
    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // y-major lines widen in x (left/right); x-major lines widen in y (top/bottom)
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor
    // (box inverted by the clamp above => fully clipped away)
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        // everything culled; still need to stop the RDTSC bucket
        goto endBinLines;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend: the two endpoints plus an unused
    // third vertex are laid out like a triangle's vertices
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        // each linked attribute is a 4-component vector
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        // lines are always front facing
        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs (3 verts worth; BE consumes triangle layout)
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

        // store line vertex data (4 verts x 4 floats, transposed layout)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // bin to every macrotile the bloated bbox overlaps
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}