swr: [rasterizer core] conservative rast backend changes
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief Offsets added to post-viewport vertex positions based on
55 /// raster state.
// Each entry is a single float broadcast with _simd_set1_ps. The table holds
// SWR_PIXEL_LOCATION_UL + 1 entries, one per pixel-location mode named in the
// trailing comments below.
// NOTE(review): the entry order presumably mirrors the numeric values of the
// SWR_PIXEL_LOCATION enum -- confirm against the enum definition.
56 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
57 {
58 _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
59 _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
60 };
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
76 BE_WORK work;
77 work.type = SYNC;
78 work.pfnWork = ProcessSyncBE;
79 work.desc.sync = *pSync;
80
81 MacroTileMgr *pTileMgr = pDC->pTileMgr;
82 pTileMgr->enqueue(0, 0, &work);
83 }
84
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief FE handler for SwrGetStats.
87 /// @param pContext - pointer to SWR context.
88 /// @param pDC - pointer to draw context.
89 /// @param workerId - thread's worker id. Even thread has a unique id.
90 /// @param pUserData - Pointer to user data passed back to stats callback.
91 /// @todo This should go away when we switch this to use compute threading.
92 void ProcessQueryStats(
93 SWR_CONTEXT *pContext,
94 DRAW_CONTEXT *pDC,
95 uint32_t workerId,
96 void *pUserData)
97 {
98 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
99 BE_WORK work;
100 work.type = QUERYSTATS;
101 work.pfnWork = ProcessQueryStatsBE;
102 work.desc.queryStats = *pQueryStats;
103
104 MacroTileMgr *pTileMgr = pDC->pTileMgr;
105 pTileMgr->enqueue(0, 0, &work);
106 }
107
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief FE handler for SwrClearRenderTarget.
110 /// @param pContext - pointer to SWR context.
111 /// @param pDC - pointer to draw context.
112 /// @param workerId - thread's worker id. Even thread has a unique id.
113 /// @param pUserData - Pointer to user data passed back to clear callback.
114 /// @todo This should go away when we switch this to use compute threading.
115 void ProcessClear(
116 SWR_CONTEXT *pContext,
117 DRAW_CONTEXT *pDC,
118 uint32_t workerId,
119 void *pUserData)
120 {
121 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
122 MacroTileMgr *pTileMgr = pDC->pTileMgr;
123
124 const API_STATE& state = GetApiState(pDC);
125
126 // queue a clear to each macro tile
127 // compute macro tile bounds for the current scissor/viewport
128 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
129 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
130 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
131 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
132
133 BE_WORK work;
134 work.type = CLEAR;
135 work.pfnWork = ProcessClearBE;
136 work.desc.clear = *pClear;
137
138 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
139 {
140 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
141 {
142 pTileMgr->enqueue(x, y, &work);
143 }
144 }
145 }
146
147 //////////////////////////////////////////////////////////////////////////
148 /// @brief FE handler for SwrStoreTiles.
149 /// @param pContext - pointer to SWR context.
150 /// @param pDC - pointer to draw context.
151 /// @param workerId - thread's worker id. Even thread has a unique id.
152 /// @param pUserData - Pointer to user data passed back to callback.
153 /// @todo This should go away when we switch this to use compute threading.
154 void ProcessStoreTiles(
155 SWR_CONTEXT *pContext,
156 DRAW_CONTEXT *pDC,
157 uint32_t workerId,
158 void *pUserData)
159 {
160 RDTSC_START(FEProcessStoreTiles);
161 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
162 MacroTileMgr *pTileMgr = pDC->pTileMgr;
163
164 const API_STATE& state = GetApiState(pDC);
165
166 // queue a store to each macro tile
167 // compute macro tile bounds for the current render target
168 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
169 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
170
171 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
172 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
173
174 // store tiles
175 BE_WORK work;
176 work.type = STORETILES;
177 work.pfnWork = ProcessStoreTileBE;
178 work.desc.storeTiles = *pStore;
179
180 for (uint32_t x = 0; x < numMacroTilesX; ++x)
181 {
182 for (uint32_t y = 0; y < numMacroTilesY; ++y)
183 {
184 pTileMgr->enqueue(x, y, &work);
185 }
186 }
187
188 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
189 }
190
191 //////////////////////////////////////////////////////////////////////////
192 /// @brief FE handler for SwrInvalidateTiles.
193 /// @param pContext - pointer to SWR context.
194 /// @param pDC - pointer to draw context.
195 /// @param workerId - thread's worker id. Even thread has a unique id.
196 /// @param pUserData - Pointer to user data passed back to callback.
197 /// @todo This should go away when we switch this to use compute threading.
198 void ProcessDiscardInvalidateTiles(
199 SWR_CONTEXT *pContext,
200 DRAW_CONTEXT *pDC,
201 uint32_t workerId,
202 void *pUserData)
203 {
204 RDTSC_START(FEProcessInvalidateTiles);
205 DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
206 MacroTileMgr *pTileMgr = pDC->pTileMgr;
207
208 SWR_RECT rect;
209
210 if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
211 {
212 // Valid rect
213 rect = pInv->rect;
214 }
215 else
216 {
217 // Use viewport dimensions
218 const API_STATE& state = GetApiState(pDC);
219
220 rect.left = (uint32_t)state.vp[0].x;
221 rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
222 rect.top = (uint32_t)state.vp[0].y;
223 rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
224 }
225
226 // queue a store to each macro tile
227 // compute macro tile bounds for the current render target
228 uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
229 uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
230
231 // Setup region assuming full tiles
232 uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
233 uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
234
235 uint32_t macroTileEndX = rect.right / macroWidth;
236 uint32_t macroTileEndY = rect.bottom / macroHeight;
237
238 if (pInv->fullTilesOnly == false)
239 {
240 // include partial tiles
241 macroTileStartX = rect.left / macroWidth;
242 macroTileStartY = rect.top / macroHeight;
243
244 macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
245 macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
246 }
247
248 SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
249 SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
250
251 macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
252 macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
253
254 // load tiles
255 BE_WORK work;
256 work.type = DISCARDINVALIDATETILES;
257 work.pfnWork = ProcessDiscardInvalidateTilesBE;
258 work.desc.discardInvalidateTiles = *pInv;
259
260 for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
261 {
262 for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
263 {
264 pTileMgr->enqueue(x, y, &work);
265 }
266 }
267
268 RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
269 }
270
271 //////////////////////////////////////////////////////////////////////////
272 /// @brief Computes the number of primitives given the number of verts.
273 /// @param mode - primitive topology for draw operation.
274 /// @param numPrims - number of vertices or indices for draw.
275 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
276 uint32_t GetNumPrims(
277 PRIMITIVE_TOPOLOGY mode,
278 uint32_t numPrims)
279 {
280 switch (mode)
281 {
282 case TOP_POINT_LIST: return numPrims;
283 case TOP_TRIANGLE_LIST: return numPrims / 3;
284 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
285 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
286 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
287 case TOP_QUAD_LIST: return numPrims / 4;
288 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
289 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
290 case TOP_LINE_LIST: return numPrims / 2;
291 case TOP_LINE_LOOP: return numPrims;
292 case TOP_RECT_LIST: return numPrims / 3;
293 case TOP_LINE_LIST_ADJ: return numPrims / 4;
294 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
295 case TOP_TRI_LIST_ADJ: return numPrims / 6;
296 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
297
298 case TOP_PATCHLIST_1:
299 case TOP_PATCHLIST_2:
300 case TOP_PATCHLIST_3:
301 case TOP_PATCHLIST_4:
302 case TOP_PATCHLIST_5:
303 case TOP_PATCHLIST_6:
304 case TOP_PATCHLIST_7:
305 case TOP_PATCHLIST_8:
306 case TOP_PATCHLIST_9:
307 case TOP_PATCHLIST_10:
308 case TOP_PATCHLIST_11:
309 case TOP_PATCHLIST_12:
310 case TOP_PATCHLIST_13:
311 case TOP_PATCHLIST_14:
312 case TOP_PATCHLIST_15:
313 case TOP_PATCHLIST_16:
314 case TOP_PATCHLIST_17:
315 case TOP_PATCHLIST_18:
316 case TOP_PATCHLIST_19:
317 case TOP_PATCHLIST_20:
318 case TOP_PATCHLIST_21:
319 case TOP_PATCHLIST_22:
320 case TOP_PATCHLIST_23:
321 case TOP_PATCHLIST_24:
322 case TOP_PATCHLIST_25:
323 case TOP_PATCHLIST_26:
324 case TOP_PATCHLIST_27:
325 case TOP_PATCHLIST_28:
326 case TOP_PATCHLIST_29:
327 case TOP_PATCHLIST_30:
328 case TOP_PATCHLIST_31:
329 case TOP_PATCHLIST_32:
330 return numPrims / (mode - TOP_PATCHLIST_BASE);
331
332 case TOP_POLYGON:
333 case TOP_POINT_LIST_BF:
334 case TOP_LINE_STRIP_CONT:
335 case TOP_LINE_STRIP_BF:
336 case TOP_LINE_STRIP_CONT_BF:
337 case TOP_TRIANGLE_FAN_NOSTIPPLE:
338 case TOP_TRI_STRIP_REVERSE:
339 case TOP_PATCHLIST_BASE:
340 case TOP_UNKNOWN:
341 SWR_ASSERT(false, "Unsupported topology: %d", mode);
342 return 0;
343 }
344
345 return 0;
346 }
347
348 //////////////////////////////////////////////////////////////////////////
349 /// @brief Computes the number of verts given the number of primitives.
350 /// @param mode - primitive topology for draw operation.
351 /// @param numPrims - number of primitives for draw.
352 uint32_t GetNumVerts(
353 PRIMITIVE_TOPOLOGY mode,
354 uint32_t numPrims)
355 {
356 switch (mode)
357 {
358 case TOP_POINT_LIST: return numPrims;
359 case TOP_TRIANGLE_LIST: return numPrims * 3;
360 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
361 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
362 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
363 case TOP_QUAD_LIST: return numPrims * 4;
364 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
365 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
366 case TOP_LINE_LIST: return numPrims * 2;
367 case TOP_LINE_LOOP: return numPrims;
368 case TOP_RECT_LIST: return numPrims * 3;
369 case TOP_LINE_LIST_ADJ: return numPrims * 4;
370 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
371 case TOP_TRI_LIST_ADJ: return numPrims * 6;
372 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
373
374 case TOP_PATCHLIST_1:
375 case TOP_PATCHLIST_2:
376 case TOP_PATCHLIST_3:
377 case TOP_PATCHLIST_4:
378 case TOP_PATCHLIST_5:
379 case TOP_PATCHLIST_6:
380 case TOP_PATCHLIST_7:
381 case TOP_PATCHLIST_8:
382 case TOP_PATCHLIST_9:
383 case TOP_PATCHLIST_10:
384 case TOP_PATCHLIST_11:
385 case TOP_PATCHLIST_12:
386 case TOP_PATCHLIST_13:
387 case TOP_PATCHLIST_14:
388 case TOP_PATCHLIST_15:
389 case TOP_PATCHLIST_16:
390 case TOP_PATCHLIST_17:
391 case TOP_PATCHLIST_18:
392 case TOP_PATCHLIST_19:
393 case TOP_PATCHLIST_20:
394 case TOP_PATCHLIST_21:
395 case TOP_PATCHLIST_22:
396 case TOP_PATCHLIST_23:
397 case TOP_PATCHLIST_24:
398 case TOP_PATCHLIST_25:
399 case TOP_PATCHLIST_26:
400 case TOP_PATCHLIST_27:
401 case TOP_PATCHLIST_28:
402 case TOP_PATCHLIST_29:
403 case TOP_PATCHLIST_30:
404 case TOP_PATCHLIST_31:
405 case TOP_PATCHLIST_32:
406 return numPrims * (mode - TOP_PATCHLIST_BASE);
407
408 case TOP_POLYGON:
409 case TOP_POINT_LIST_BF:
410 case TOP_LINE_STRIP_CONT:
411 case TOP_LINE_STRIP_BF:
412 case TOP_LINE_STRIP_CONT_BF:
413 case TOP_TRIANGLE_FAN_NOSTIPPLE:
414 case TOP_TRI_STRIP_REVERSE:
415 case TOP_PATCHLIST_BASE:
416 case TOP_UNKNOWN:
417 SWR_ASSERT(false, "Unsupported topology: %d", mode);
418 return 0;
419 }
420
421 return 0;
422 }
423
424 //////////////////////////////////////////////////////////////////////////
425 /// @brief Return number of verts per primitive.
426 /// @param topology - topology
427 /// @param includeAdjVerts - include adjacent verts in primitive vertices
428 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
429 {
430 uint32_t numVerts = 0;
431 switch (topology)
432 {
433 case TOP_POINT_LIST:
434 case TOP_POINT_LIST_BF:
435 numVerts = 1;
436 break;
437 case TOP_LINE_LIST:
438 case TOP_LINE_STRIP:
439 case TOP_LINE_LIST_ADJ:
440 case TOP_LINE_LOOP:
441 case TOP_LINE_STRIP_CONT:
442 case TOP_LINE_STRIP_BF:
443 case TOP_LISTSTRIP_ADJ:
444 numVerts = 2;
445 break;
446 case TOP_TRIANGLE_LIST:
447 case TOP_TRIANGLE_STRIP:
448 case TOP_TRIANGLE_FAN:
449 case TOP_TRI_LIST_ADJ:
450 case TOP_TRI_STRIP_ADJ:
451 case TOP_TRI_STRIP_REVERSE:
452 case TOP_RECT_LIST:
453 numVerts = 3;
454 break;
455 case TOP_QUAD_LIST:
456 case TOP_QUAD_STRIP:
457 numVerts = 4;
458 break;
459 case TOP_PATCHLIST_1:
460 case TOP_PATCHLIST_2:
461 case TOP_PATCHLIST_3:
462 case TOP_PATCHLIST_4:
463 case TOP_PATCHLIST_5:
464 case TOP_PATCHLIST_6:
465 case TOP_PATCHLIST_7:
466 case TOP_PATCHLIST_8:
467 case TOP_PATCHLIST_9:
468 case TOP_PATCHLIST_10:
469 case TOP_PATCHLIST_11:
470 case TOP_PATCHLIST_12:
471 case TOP_PATCHLIST_13:
472 case TOP_PATCHLIST_14:
473 case TOP_PATCHLIST_15:
474 case TOP_PATCHLIST_16:
475 case TOP_PATCHLIST_17:
476 case TOP_PATCHLIST_18:
477 case TOP_PATCHLIST_19:
478 case TOP_PATCHLIST_20:
479 case TOP_PATCHLIST_21:
480 case TOP_PATCHLIST_22:
481 case TOP_PATCHLIST_23:
482 case TOP_PATCHLIST_24:
483 case TOP_PATCHLIST_25:
484 case TOP_PATCHLIST_26:
485 case TOP_PATCHLIST_27:
486 case TOP_PATCHLIST_28:
487 case TOP_PATCHLIST_29:
488 case TOP_PATCHLIST_30:
489 case TOP_PATCHLIST_31:
490 case TOP_PATCHLIST_32:
491 numVerts = topology - TOP_PATCHLIST_BASE;
492 break;
493 default:
494 SWR_ASSERT(false, "Unsupported topology: %d", topology);
495 break;
496 }
497
498 if (includeAdjVerts)
499 {
500 switch (topology)
501 {
502 case TOP_LISTSTRIP_ADJ:
503 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
504 case TOP_TRI_STRIP_ADJ:
505 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
506 default: break;
507 }
508 }
509
510 return numVerts;
511 }
512
513 //////////////////////////////////////////////////////////////////////////
514 /// @brief Generate mask from remaining work.
515 /// @param numWorkItems - Number of items being worked on by a SIMD.
516 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
517 {
518 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
519 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
520 return _simd_castps_si(vMask(mask));
521 }
522
523 //////////////////////////////////////////////////////////////////////////
524 /// @brief StreamOut - Streams vertex data out to SO buffers.
525 /// Generally, we are only streaming out a SIMDs worth of triangles.
526 /// @param pDC - pointer to draw context.
527 /// @param workerId - thread's worker id. Even thread has a unique id.
528 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
529 static void StreamOut(
530 DRAW_CONTEXT* pDC,
531 PA_STATE& pa,
532 uint32_t workerId,
533 uint32_t* pPrimData,
534 uint32_t streamIndex)
535 {
536 RDTSC_START(FEStreamout);
537
538 SWR_CONTEXT* pContext = pDC->pContext;
539
540 const API_STATE& state = GetApiState(pDC);
541 const SWR_STREAMOUT_STATE &soState = state.soState;
542
543 uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
544
545 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
546 uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
547
548 SWR_STREAMOUT_CONTEXT soContext = { 0 };
549
550 // Setup buffer state pointers.
551 for (uint32_t i = 0; i < 4; ++i)
552 {
553 soContext.pBuffer[i] = &state.soBuffer[i];
554 }
555
556 uint32_t numPrims = pa.NumPrims();
557 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
558 {
559 DWORD slot = 0;
560 uint32_t soMask = soState.streamMasks[streamIndex];
561
562 // Write all entries into primitive data buffer for SOS.
563 while (_BitScanForward(&slot, soMask))
564 {
565 __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
566 uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
567 pa.AssembleSingle(paSlot, primIndex, attrib);
568
569 // Attribute offset is relative offset from start of vertex.
570 // Note that attributes start at slot 1 in the PA buffer. We need to write this
571 // to prim data starting at slot 0. Which is why we do (slot - 1).
572 // Also note: GL works slightly differently, and needs slot 0
573 uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
574
575 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
576 for (uint32_t v = 0; v < soVertsPerPrim; ++v)
577 {
578 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
579
580 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
581 }
582 soMask &= ~(1 << slot);
583 }
584
585 // Update pPrimData pointer
586 soContext.pPrimData = pPrimData;
587
588 // Call SOS
589 SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
590 state.pfnSoFunc[streamIndex](soContext);
591 }
592
593 // Update SO write offset. The driver provides memory for the update.
594 for (uint32_t i = 0; i < 4; ++i)
595 {
596 if (state.soBuffer[i].pWriteOffset)
597 {
598 *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
599
600 // The SOS increments the existing write offset. So we don't want to increment
601 // the SoWriteOffset stat using an absolute offset instead of relative.
602 SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
603 }
604 }
605
606 UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
607 UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
608
609 RDTSC_STOP(FEStreamout, 1, 0);
610 }
611
612 //////////////////////////////////////////////////////////////////////////
613 /// @brief Computes number of invocations. The current index represents
614 /// the start of the SIMD. The max index represents how much work
615 /// items are remaining. If there is less then a SIMD's left of work
616 /// then return the remaining amount of work.
617 /// @param curIndex - The start index for the SIMD.
618 /// @param maxIndex - The last index for all work items.
619 static INLINE uint32_t GetNumInvocations(
620 uint32_t curIndex,
621 uint32_t maxIndex)
622 {
623 uint32_t remainder = (maxIndex - curIndex);
624 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
625 }
626
627 //////////////////////////////////////////////////////////////////////////
628 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
629 /// The geometry shader will loop over each active streamout buffer, assembling
630 /// primitives for the downstream stages. When multistream output is enabled,
631 /// the generated stream ID buffer from the GS needs to be converted to a cut
632 /// buffer for the primitive assembler.
633 /// @param stream - stream id to generate the cut buffer for
634 /// @param pStreamIdBase - pointer to the stream ID buffer
635 /// @param numEmittedVerts - Number of total verts emitted by the GS
636 /// @param pCutBuffer - output buffer to write cuts to
637 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
638 {
639 SWR_ASSERT(stream < MAX_SO_STREAMS);
640
641 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
642 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
643
644 for (uint32_t b = 0; b < numOutputBytes; ++b)
645 {
646 uint8_t curInputByte = pStreamIdBase[2*b];
647 uint8_t outByte = 0;
648 for (uint32_t i = 0; i < 4; ++i)
649 {
650 if ((curInputByte & 0x3) != stream)
651 {
652 outByte |= (1 << i);
653 }
654 curInputByte >>= 2;
655 }
656
657 curInputByte = pStreamIdBase[2 * b + 1];
658 for (uint32_t i = 0; i < 4; ++i)
659 {
660 if ((curInputByte & 0x3) != stream)
661 {
662 outByte |= (1 << (i + 4));
663 }
664 curInputByte >>= 2;
665 }
666
667 *pCutBuffer++ = outByte;
668 }
669 }
670
671 THREAD SWR_GS_CONTEXT tlsGsContext;
672
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Implements GS stage.
675 /// @param pDC - pointer to draw context.
676 /// @param workerId - thread's worker id. Even thread has a unique id.
677 /// @param pa - The primitive assembly object.
678 /// @param pGsOut - output stream for GS
679 template <
680 typename HasStreamOutT,
681 typename HasRastT>
682 static void GeometryShaderStage(
683 DRAW_CONTEXT *pDC,
684 uint32_t workerId,
685 PA_STATE& pa,
686 void* pGsOut,
687 void* pCutBuffer,
688 void* pStreamCutBuffer,
689 uint32_t* pSoPrimData,
690 simdscalari primID)
691 {
692 RDTSC_START(FEGeometryShader);
693
694 SWR_CONTEXT* pContext = pDC->pContext;
695
696 const API_STATE& state = GetApiState(pDC);
697 const SWR_GS_STATE* pState = &state.gsState;
698
699 SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
700 SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
701
702 tlsGsContext.pStream = (uint8_t*)pGsOut;
703 tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
704 tlsGsContext.PrimitiveID = primID;
705
706 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
707 simdvector attrib[MAX_ATTRIBUTES];
708
709 // assemble all attributes for the input primitive
710 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
711 {
712 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
713 pa.Assemble(attribSlot, attrib);
714
715 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
716 {
717 tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
718 }
719 }
720
721 // assemble position
722 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
723 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
724 {
725 tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
726 }
727
728 const uint32_t vertexStride = sizeof(simdvertex);
729 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
730 const uint32_t inputPrimStride = numSimdBatches * vertexStride;
731 const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
732 uint32_t cutPrimStride;
733 uint32_t cutInstanceStride;
734
735 if (pState->isSingleStream)
736 {
737 cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
738 cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
739 }
740 else
741 {
742 cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
743 cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
744 }
745
746 // record valid prims from the frontend to avoid over binning the newly generated
747 // prims from the GS
748 uint32_t numInputPrims = pa.NumPrims();
749
750 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
751 {
752 tlsGsContext.InstanceID = instance;
753 tlsGsContext.mask = GenerateMask(numInputPrims);
754
755 // execute the geometry shader
756 state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
757
758 tlsGsContext.pStream += instanceStride;
759 tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
760 }
761
762 // set up new binner and state for the GS output topology
763 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
764 if (HasRastT::value)
765 {
766 switch (pState->outputTopology)
767 {
768 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
769 case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
770 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
771 default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
772 }
773 }
774
775 // foreach input prim:
776 // - setup a new PA based on the emitted verts for that prim
777 // - loop over the new verts, calling PA to assemble each prim
778 uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
779 uint32_t* pPrimitiveId = (uint32_t*)&primID;
780
781 uint32_t totalPrimsGenerated = 0;
782 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
783 {
784 uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
785 uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
786 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
787 {
788 uint32_t numEmittedVerts = pVertexCount[inputPrim];
789 if (numEmittedVerts == 0)
790 {
791 continue;
792 }
793
794 uint8_t* pBase = pInstanceBase + instance * instanceStride;
795 uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
796
797 DWORD numAttribs;
798 if (_BitScanReverse(&numAttribs, state.feAttribMask))
799 {
800 numAttribs++;
801 }
802 else
803 {
804 numAttribs = 0;
805 }
806
807 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
808 {
809 bool processCutVerts = false;
810
811 uint8_t* pCutBuffer = pCutBase;
812
813 // assign default stream ID, only relevant when GS is outputting a single stream
814 uint32_t streamID = 0;
815 if (pState->isSingleStream)
816 {
817 processCutVerts = true;
818 streamID = pState->singleStreamID;
819 if (streamID != stream) continue;
820 }
821 else
822 {
823 // early exit if this stream is not enabled for streamout
824 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
825 {
826 continue;
827 }
828
829 // multi-stream output, need to translate StreamID buffer to a cut buffer
830 ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
831 pCutBuffer = (uint8_t*)pStreamCutBuffer;
832 processCutVerts = false;
833 }
834
835 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
836
837 while (gsPa.GetNextStreamOutput())
838 {
839 do
840 {
841 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
842
843 if (assemble)
844 {
845 totalPrimsGenerated += gsPa.NumPrims();
846
847 if (HasStreamOutT::value)
848 {
849 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
850 }
851
852 if (HasRastT::value && state.soState.streamToRasterizer == stream)
853 {
854 simdscalari vPrimId;
855 // pull primitiveID from the GS output if available
856 if (state.gsState.emitsPrimitiveID)
857 {
858 simdvector primIdAttrib[3];
859 gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
860 vPrimId = _simd_castps_si(primIdAttrib[0].x);
861 }
862 else
863 {
864 vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
865 }
866
867 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
868 }
869 }
870 } while (gsPa.NextPrim());
871 }
872 }
873 }
874 }
875
876 // update GS pipeline stats
877 UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
878 UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
879
880 RDTSC_STOP(FEGeometryShader, 1, 0);
881 }
882
883 //////////////////////////////////////////////////////////////////////////
884 /// @brief Allocate GS buffers
885 /// @param pDC - pointer to draw context.
886 /// @param state - API state
887 /// @param ppGsOut - pointer to GS output buffer allocation
888 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
889 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
890 void **ppStreamCutBuffer)
891 {
892 auto pArena = pDC->pArena;
893 SWR_ASSERT(pArena != nullptr);
894 SWR_ASSERT(state.gsState.gsEnable);
895 // allocate arena space to hold GS output verts
896 // @todo pack attribs
897 // @todo support multiple streams
898 const uint32_t vertexStride = sizeof(simdvertex);
899 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
900 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
901 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
902
903 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
904 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
905 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
906 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
907
908 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
909 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
910
911 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
912 if (state.gsState.isSingleStream)
913 {
914 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
915 *ppStreamCutBuffer = nullptr;
916 }
917 else
918 {
919 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
920 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
921 }
922
923 }
924
925 //////////////////////////////////////////////////////////////////////////
926 /// @brief Contains all data generated by the HS and passed to the
927 /// tessellator and DS.
/// One instance is lazily allocated (zero-filled) by AllocateTessellationData
/// and reached through gt_pTessellationThreadData.
928 struct TessellationThreadLocalData
929 {
// Hull shader context handed to state.pfnHsFunc.
930 SWR_HS_CONTEXT hsContext;
// Per-lane patch storage; TessellationStages points hsContext.pCPout here.
931 ScalarPatch patchData[KNOB_SIMD_WIDTH];
// Memory block passed to TSInitCtx as tessellator context storage.
932 void* pTxCtx;
// Size value passed to TSInitCtx along with pTxCtx and used to size its allocation.
933 size_t tsCtxSize;
934
// Domain shader output buffer; grown on demand in TessellationStages.
935 simdscalar* pDSOutput;
// Capacity of pDSOutput, counted in simdvector-sized elements.
936 size_t numDSOutputVectors;
937 };
938
// Tessellation scratch data for the current thread; nullptr until
// AllocateTessellationData has run.
939 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
940
941 //////////////////////////////////////////////////////////////////////////
942 /// @brief Allocate tessellation data for this worker thread.
943 INLINE
944 static void AllocateTessellationData(SWR_CONTEXT* pContext)
945 {
946 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
947 if (gt_pTessellationThreadData == nullptr)
948 {
949 gt_pTessellationThreadData = (TessellationThreadLocalData*)
950 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
951 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
952 }
953 }
954
955 //////////////////////////////////////////////////////////////////////////
956 /// @brief Implements Tessellation Stages.
/// Runs the hull shader once for the SIMD of input patches, then for each
/// patch runs the tessellator and the domain shader, and feeds the resulting
/// primitives to the geometry shader stage, or to stream-out and the clipper.
/// @tparam HasGeometryShaderT - route DS output through GeometryShaderStage
/// @tparam HasStreamOutT - stream DS output primitives out
/// @tparam HasRastT - send DS output primitives to the clip function
957 /// @param pDC - pointer to draw context.
958 /// @param workerId - thread's worker id. Every thread has a unique id.
959 /// @param pa - The primitive assembly object.
960 /// @param pGsOut - output stream for GS
/// @param pCutBuffer - forwarded unchanged to GeometryShaderStage
/// @param pCutStreamBuffer - forwarded unchanged to GeometryShaderStage
/// @param pSoPrimData - forwarded to GeometryShaderStage / StreamOut
/// @param primID - primitive ID per SIMD lane for the input patches
961 template <
962 typename HasGeometryShaderT,
963 typename HasStreamOutT,
964 typename HasRastT>
965 static void TessellationStages(
966 DRAW_CONTEXT *pDC,
967 uint32_t workerId,
968 PA_STATE& pa,
969 void* pGsOut,
970 void* pCutBuffer,
971 void* pCutStreamBuffer,
972 uint32_t* pSoPrimData,
973 simdscalari primID)
974 {
975 const API_STATE& state = GetApiState(pDC);
976 const SWR_TS_STATE& tsState = state.tsState;
977 SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
978
979 SWR_ASSERT(gt_pTessellationThreadData);
980
// Try to initialize the tessellator in the per-thread context memory.
// NOTE(review): presumably TSInitCtx returns nullptr when that memory is
// missing or too small and reports the required size through tsCtxSize -
// confirm against the tessellator header.
981 HANDLE tsCtx = TSInitCtx(
982 tsState.domain,
983 tsState.partitioning,
984 tsState.tsOutputTopology,
985 gt_pTessellationThreadData->pTxCtx,
986 gt_pTessellationThreadData->tsCtxSize);
987 if (tsCtx == nullptr)
988 {
// NOTE(review): any previous pTxCtx is overwritten here without being
// freed, and the new allocation is not checked before the retry.
989 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
990 tsCtx = TSInitCtx(
991 tsState.domain,
992 tsState.partitioning,
993 tsState.tsOutputTopology,
994 gt_pTessellationThreadData->pTxCtx,
995 gt_pTessellationThreadData->tsCtxSize);
996 }
997 SWR_ASSERT(tsCtx);
998
// Pick the clipper entry point matching the post-DS topology; only needed
// when rasterization is enabled.
999 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1000 if (HasRastT::value)
1001 {
1002 switch (tsState.postDSTopology)
1003 {
1004 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1005 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
1006 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
1007 default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
1008 }
1009 }
1010
// HS writes its per-lane patch output into the thread-local patchData array.
1011 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1012 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1013 hsContext.PrimitiveID = primID;
1014
1015 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1016 // Max storage for one attribute for an entire simdprimitive
1017 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1018
1019 // assemble all attributes for the input primitives
1020 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1021 {
1022 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1023 pa.Assemble(attribSlot, simdattrib);
1024
1025 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1026 {
1027 hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1028 }
1029 }
1030
// Debug builds pre-fill the HS output with a recognizable byte pattern.
1031 #if defined(_DEBUG)
1032 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1033 #endif
1034
1035 uint32_t numPrims = pa.NumPrims();
1036 hsContext.mask = GenerateMask(numPrims);
1037
1038 // Run the HS
1039 RDTSC_START(FEHullShader);
1040 state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1041 RDTSC_STOP(FEHullShader, 0, 0);
1042
1043 UPDATE_STAT(HsInvocations, numPrims);
1044
// View the SIMD primitive IDs as an array so lane p can be read per patch.
1045 const uint32_t* pPrimId = (const uint32_t*)&primID;
1046
// Everything after the HS is done one patch (SIMD lane) at a time.
1047 for (uint32_t p = 0; p < numPrims; ++p)
1048 {
1049 // Run Tessellator
1050 SWR_TS_TESSELLATED_DATA tsData = { 0 };
1051 RDTSC_START(FETessellation);
1052 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1053 RDTSC_STOP(FETessellation, 0, 0);
1054
// Nothing was generated for this patch; move on to the next one.
1055 if (tsData.NumPrimitives == 0)
1056 {
1057 continue;
1058 }
1059 SWR_ASSERT(tsData.NumDomainPoints);
1060
1061 // Allocate DS Output memory
// The buffer lives in thread-local data and only grows; it is reused across
// patches and calls.
// NOTE(review): the AlignedMalloc result is only checked by SWR_ASSERT, and
// numDSOutputVectors is updated even if the allocation failed.
1062 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1063 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1064 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1065 if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1066 {
1067 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1068 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1069 gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1070 }
1071 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1072 SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1073
1074 #if defined(_DEBUG)
1075 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1076 #endif
1077
1078 // Run Domain Shader
1079 SWR_DS_CONTEXT dsContext;
1080 dsContext.PrimitiveID = pPrimId[p];
1081 dsContext.pCpIn = &hsContext.pCPout[p];
1082 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1083 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1084 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1085 dsContext.vectorStride = requiredDSVectorInvocations;
1086
// Number of domain points already covered by earlier DS invocations.
1087 uint32_t dsInvocations = 0;
1088
// One DS invocation per SIMD-width batch of domain points.
1089 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1090 {
// Mask is built from the count of domain points still to be shaded.
1091 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1092
1093 RDTSC_START(FEDomainShader);
1094 state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1095 RDTSC_STOP(FEDomainShader, 0, 0);
1096
1097 dsInvocations += KNOB_SIMD_WIDTH;
1098 }
1099 UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
1100
// Primitive assembler over the DS output, indexed by the tessellator's
// index lists.
1101 PA_TESS tessPa(
1102 pDC,
1103 dsContext.pOutputData,
1104 dsContext.vectorStride,
1105 tsState.numDsOutputAttribs,
1106 tsData.ppIndices,
1107 tsData.NumPrimitives,
1108 tsState.postDSTopology);
1109
1110 while (tessPa.HasWork())
1111 {
1112 if (HasGeometryShaderT::value)
1113 {
// Every primitive generated from this patch carries the patch's primitive ID.
1114 GeometryShaderStage<HasStreamOutT, HasRastT>(
1115 pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1116 _simd_set1_epi32(dsContext.PrimitiveID));
1117 }
1118 else
1119 {
1120 if (HasStreamOutT::value)
1121 {
1122 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1123 }
1124
1125 if (HasRastT::value)
1126 {
1127 simdvector prim[3]; // Only deal with triangles, lines, or points
1128 RDTSC_START(FEPAAssemble);
// 'assemble' is declared only when asserts are enabled; otherwise the
// Assemble call below stands alone as an expression statement.
1129 #if SWR_ENABLE_ASSERTS
1130 bool assemble =
1131 #endif
1132 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1133 RDTSC_STOP(FEPAAssemble, 1, 0);
1134 SWR_ASSERT(assemble);
1135
1136 SWR_ASSERT(pfnClipFunc);
1137 pfnClipFunc(pDC, tessPa, workerId, prim,
1138 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
1139 }
1140 }
1141
1142 tessPa.NextPrim();
1143
1144 } // while (tessPa.HasWork())
1145 } // for (uint32_t p = 0; p < numPrims; ++p)
1146
1147 TSDestroyCtx(tsCtx);
1148 }
1149
1150 //////////////////////////////////////////////////////////////////////////
1151 /// @brief FE handler for SwrDraw.
/// For each instance, fetches and vertex-shades one SIMD of vertices at a
/// time, assembles primitives, and passes them to the tessellation stages,
/// the geometry shader stage, or directly to stream-out and pfnProcessPrims.
1152 /// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index enabled (used to select the PA)
1153 /// @tparam HasTessellationT - Is tessellation enabled
1154 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1155 /// @tparam HasStreamOutT - Is stream-out enabled
1156 /// @tparam HasRastT - Is rasterization enabled
1157 /// @param pContext - pointer to SWR context.
1158 /// @param pDC - pointer to draw context.
1159 /// @param workerId - thread's worker id.
1160 /// @param pUserData - Pointer to DRAW_WORK
1161 template <
1162 typename IsIndexedT,
1163 typename IsCutIndexEnabledT,
1164 typename HasTessellationT,
1165 typename HasGeometryShaderT,
1166 typename HasStreamOutT,
1167 typename HasRastT>
1168 void ProcessDraw(
1169 SWR_CONTEXT *pContext,
1170 DRAW_CONTEXT *pDC,
1171 uint32_t workerId,
1172 void *pUserData)
1173 {
1174
// Toss point: skip all front-end work for this draw.
1175 #if KNOB_ENABLE_TOSS_POINTS
1176 if (KNOB_TOSS_QUEUE_FE)
1177 {
1178 return;
1179 }
1180 #endif
1181
1182 RDTSC_START(FEProcessDraw);
1183
1184 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1185 const API_STATE& state = GetApiState(pDC);
// Per-lane offsets 0..7, added to startVertexID to build sequential vertex
// indices for non-indexed draws.
1186 __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1187 SWR_VS_CONTEXT vsContext;
1188 simdvertex vin;
1189
1190 int indexSize = 0;
1191 uint32_t endVertex = work.numVerts;
1192
// Address just past the last index this draw requests, computed with the
// element size of the index type.
1193 const int32_t* pLastRequestedIndex = nullptr;
1194 if (IsIndexedT::value)
1195 {
1196 switch (work.type)
1197 {
1198 case R32_UINT:
1199 indexSize = sizeof(uint32_t);
1200 pLastRequestedIndex = &(work.pIB[endVertex]);
1201 break;
1202 case R16_UINT:
1203 indexSize = sizeof(uint16_t);
1204 // nasty address offset to last index
1205 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1206 break;
1207 case R8_UINT:
1208 indexSize = sizeof(uint8_t);
1209 // nasty address offset to last index
1210 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1211 break;
1212 default:
1213 SWR_ASSERT(0);
1214 }
1215 }
1216 else
1217 {
1218 // No cuts, prune partial primitives.
1219 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1220 }
1221
1222 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1223 fetchInfo.pStreams = &state.vertexBuffers[0];
1224 fetchInfo.StartInstance = work.startInstance;
1225 fetchInfo.StartVertex = 0;
1226
1227 vsContext.pVin = &vin;
1228
1229 if (IsIndexedT::value)
1230 {
1231 fetchInfo.BaseVertex = work.baseVertex;
1232
1233 // if the entire index buffer isn't being consumed, set the last index
1234 // so that fetches < a SIMD wide will be masked off
1235 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1236 if (pLastRequestedIndex < fetchInfo.pLastIndex)
1237 {
1238 fetchInfo.pLastIndex = pLastRequestedIndex;
1239 }
1240 }
1241 else
1242 {
1243 fetchInfo.StartVertex = work.startVertex;
1244 }
1245
// numPrims is only consumed by the RDTSC_STOP at the end of the function.
1246 #ifdef KNOB_ENABLE_RDTSC
1247 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1248 #endif
1249
1250 void* pGsOut = nullptr;
1251 void* pCutBuffer = nullptr;
1252 void* pStreamCutBuffer = nullptr;
1253 if (HasGeometryShaderT::value)
1254 {
1255 AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1256 }
1257
// The template flag and the API state must agree on whether tessellation
// is active.
1258 if (HasTessellationT::value)
1259 {
1260 SWR_ASSERT(state.tsState.tsEnable == true);
1261 SWR_ASSERT(state.pfnHsFunc != nullptr);
1262 SWR_ASSERT(state.pfnDsFunc != nullptr);
1263
1264 AllocateTessellationData(pContext);
1265 }
1266 else
1267 {
1268 SWR_ASSERT(state.tsState.tsEnable == false);
1269 SWR_ASSERT(state.pfnHsFunc == nullptr);
1270 SWR_ASSERT(state.pfnDsFunc == nullptr);
1271 }
1272
1273 // allocate space for streamout input prim data
1274 uint32_t* pSoPrimData = nullptr;
1275 if (HasStreamOutT::value)
1276 {
1277 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1278
1279 // set each SoWriteOffset stat to the matching SO buffer's stream offset
1280 for (uint32_t i = 0; i < 4; ++i)
1281 {
1282 SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
1283 }
1284
1285 }
1286
1287 // choose primitive assembler
1288 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1289 PA_STATE& pa = paFactory.GetPA();
1290
1291 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1292 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1293 {
1294 simdscalari vIndex;
// Vertices consumed so far in this instance; advances by KNOB_SIMD_WIDTH.
1295 uint32_t i = 0;
1296
// Restart the index source at the beginning for every instance.
1297 if (IsIndexedT::value)
1298 {
1299 fetchInfo.pIndices = work.pIB;
1300 }
1301 else
1302 {
// Non-indexed: the fetch reads its indices from the local vIndex register,
// so updating vIndex below advances what the fetch shader sees.
1303 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1304 fetchInfo.pIndices = (const int32_t*)&vIndex;
1305 }
1306
1307 fetchInfo.CurInstance = instanceNum;
1308 vsContext.InstanceID = instanceNum;
1309
1310 while (pa.HasWork())
1311 {
1312 // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1313 // So we need to keep this outside of (i < endVertex) check.
1314 simdmask* pvCutIndices = nullptr;
1315 if (IsIndexedT::value)
1316 {
1317 pvCutIndices = &pa.GetNextVsIndices();
1318 }
1319
1320 simdvertex& vout = pa.GetNextVsOutput();
1321 vsContext.pVout = &vout;
1322
// Fetch and shade only while there are vertices left; the assembly loop
// below still runs afterwards while the PA reports work.
1323 if (i < endVertex)
1324 {
1325
1326 // 1. Execute FS/VS for a single SIMD.
1327 RDTSC_START(FEFetchShader);
1328 state.pfnFetchFunc(fetchInfo, vin);
1329 RDTSC_STOP(FEFetchShader, 0, 0);
1330
1331 // forward fetch generated vertex IDs to the vertex shader
1332 vsContext.VertexID = fetchInfo.VertexID;
1333
1334 // Setup active mask for vertex shader.
1335 vsContext.mask = GenerateMask(endVertex - i);
1336
1337 // forward cut mask to the PA
1338 if (IsIndexedT::value)
1339 {
1340 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1341 }
1342
1343 UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
1344
1345 #if KNOB_ENABLE_TOSS_POINTS
1346 if (!KNOB_TOSS_FETCH)
1347 #endif
1348 {
1349 RDTSC_START(FEVertexShader);
1350 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1351 RDTSC_STOP(FEVertexShader, 0, 0);
1352
1353 UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
1354 }
1355 }
1356
1357 // 2. Assemble primitives given the last two SIMD.
1358 do
1359 {
1360 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1361 // PaAssemble returns false if there is not enough verts to assemble.
1362 RDTSC_START(FEPAAssemble);
1363 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1364 RDTSC_STOP(FEPAAssemble, 1, 0);
1365
// Toss points: when enabled, the KNOB_TOSS_FETCH / KNOB_TOSS_VS knobs skip
// everything downstream of primitive assembly.
1366 #if KNOB_ENABLE_TOSS_POINTS
1367 if (!KNOB_TOSS_FETCH)
1368 #endif
1369 {
1370 #if KNOB_ENABLE_TOSS_POINTS
1371 if (!KNOB_TOSS_VS)
1372 #endif
1373 {
1374 if (assemble)
1375 {
1376 UPDATE_STAT(IaPrimitives, pa.NumPrims());
1377
// Route the assembled SIMD of primitives to the first enabled stage.
1378 if (HasTessellationT::value)
1379 {
1380 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1381 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1382 }
1383 else if (HasGeometryShaderT::value)
1384 {
1385 GeometryShaderStage<HasStreamOutT, HasRastT>(
1386 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1387 }
1388 else
1389 {
1390 // If streamout is enabled then stream vertices out to memory.
1391 if (HasStreamOutT::value)
1392 {
1393 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1394 }
1395
1396 if (HasRastT::value)
1397 {
1398 SWR_ASSERT(pDC->pState->pfnProcessPrims);
1399 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1400 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
1401 }
1402 }
1403 }
1404 }
1405 }
1406 } while (pa.NextPrim());
1407
// Advance to the next SIMD of vertices: step the index pointer by whole
// index elements, or bump the generated vertex IDs.
1408 i += KNOB_SIMD_WIDTH;
1409 if (IsIndexedT::value)
1410 {
1411 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1412 }
1413 else
1414 {
1415 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1416 }
1417 }
// Reset the primitive assembler before the next instance.
1418 pa.Reset();
1419 }
1420
1421 RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
1422 }
1423
1424 struct FEDrawChooser
1425 {
1426 typedef PFN_FE_WORK_FUNC FuncType;
1427
1428 template <typename... ArgsB>
1429 static FuncType GetFunc()
1430 {
1431 return ProcessDraw<ArgsB...>;
1432 }
1433 };
1434
1435
1436 // Selector for correct templated Draw front-end function
1437 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1438 bool IsIndexed,
1439 bool IsCutIndexEnabled,
1440 bool HasTessellation,
1441 bool HasGeometryShader,
1442 bool HasStreamOut,
1443 bool HasRasterization)
1444 {
1445 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1446 }
1447
1448
1449 //////////////////////////////////////////////////////////////////////////
1450 /// @brief Processes attributes for the backend based on linkage mask and
1451 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1452 /// @param pDC - Draw context
1453 /// @param pa - Primitive Assembly state
1454 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1455 /// @param pLinkageMap - maps VS attribute slot to PS slot
1456 /// @param triIndex - Triangle to process attributes for
1457 /// @param pBuffer - Output result
1458 template<uint32_t NumVerts>
1459 INLINE void ProcessAttributes(
1460 DRAW_CONTEXT *pDC,
1461 PA_STATE&pa,
1462 uint32_t linkageMask,
1463 const uint8_t* pLinkageMap,
1464 uint32_t triIndex,
1465 float *pBuffer)
1466 {
1467 DWORD slot = 0;
1468 uint32_t mapIdx = 0;
1469 LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
1470 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1471 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1472
1473 while (_BitScanForward(&slot, linkageMask))
1474 {
1475 linkageMask &= ~(1 << slot); // done with this bit.
1476
1477 // compute absolute slot in vertex attrib array
1478 uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];
1479
1480 __m128 attrib[3]; // triangle attribs (always 4 wide)
1481
1482 if (_bittest(&constantInterpMask, mapIdx))
1483 {
1484 uint32_t vid;
1485 static const uint32_t tristripProvokingVertex[] = {0, 2, 1};
1486 static const int32_t quadProvokingTri[2][4] = {{0, 0, 0, 1}, {0, -1, 0, 0}};
1487 static const uint32_t quadProvokingVertex[2][4] = {{0, 1, 2, 2}, {0, 1, 1, 2}};
1488 static const int32_t qstripProvokingTri[2][4] = {{0, 0, 0, 1}, {-1, 0, 0, 0}};
1489 static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}};
1490
1491 switch (topo) {
1492 case TOP_QUAD_LIST:
1493 pa.AssembleSingle(inputSlot,
1494 triIndex + quadProvokingTri[triIndex & 1][provokingVertex],
1495 attrib);
1496 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1497 break;
1498 case TOP_QUAD_STRIP:
1499 pa.AssembleSingle(inputSlot,
1500 triIndex + qstripProvokingTri[triIndex & 1][provokingVertex],
1501 attrib);
1502 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1503 break;
1504 case TOP_TRIANGLE_STRIP:
1505 pa.AssembleSingle(inputSlot, triIndex, attrib);
1506 vid = (triIndex & 1)
1507 ? tristripProvokingVertex[provokingVertex]
1508 : provokingVertex;
1509 break;
1510 default:
1511 pa.AssembleSingle(inputSlot, triIndex, attrib);
1512 vid = provokingVertex;
1513 break;
1514 }
1515
1516 for (uint32_t i = 0; i < NumVerts; ++i)
1517 {
1518 _mm_store_ps(pBuffer, attrib[vid]);
1519 pBuffer += 4;
1520 }
1521 }
1522 else
1523 {
1524 pa.AssembleSingle(inputSlot, triIndex, attrib);
1525
1526 for (uint32_t i = 0; i < NumVerts; ++i)
1527 {
1528 _mm_store_ps(pBuffer, attrib[i]);
1529 pBuffer += 4;
1530 }
1531 }
1532
1533 // pad out the attrib buffer to 3 verts to ensure the triangle
1534 // interpolation code in the pixel shader works correctly for the
1535 // 3 topologies - point, line, tri. This effectively zeros out the
1536 // effect of the missing vertices in the triangle interpolation.
1537 for (uint32_t i = NumVerts; i < 3; ++i)
1538 {
1539 _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
1540 pBuffer += 4;
1541 }
1542
1543 mapIdx++;
1544 }
1545 }
1546
1547 //////////////////////////////////////////////////////////////////////////
1548 /// @brief Processes enabled user clip distances. Loads the active clip
1549 /// distances from the PA, sets up barycentric equations, and
1550 /// stores the results to the output buffer
1551 /// @param pa - Primitive Assembly state
1552 /// @param primIndex - primitive index to process
1553 /// @param clipDistMask - mask of enabled clip distances
1554 /// @param pUserClipBuffer - buffer to store results
1555 template<uint32_t NumVerts>
1556 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1557 {
1558 DWORD clipDist;
1559 while (_BitScanForward(&clipDist, clipDistMask))
1560 {
1561 clipDistMask &= ~(1 << clipDist);
1562 uint32_t clipSlot = clipDist >> 2;
1563 uint32_t clipComp = clipDist & 0x3;
1564 uint32_t clipAttribSlot = clipSlot == 0 ?
1565 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1566
1567 __m128 primClipDist[3];
1568 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1569
1570 float vertClipDist[NumVerts];
1571 for (uint32_t e = 0; e < NumVerts; ++e)
1572 {
1573 OSALIGNSIMD(float) aVertClipDist[4];
1574 _mm_store_ps(aVertClipDist, primClipDist[e]);
1575 vertClipDist[e] = aVertClipDist[clipComp];
1576 };
1577
1578 // setup plane equations for barycentric interpolation in the backend
1579 float baryCoeff[NumVerts];
1580 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1581 {
1582 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1583 }
1584 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1585
1586 for (uint32_t e = 0; e < NumVerts; ++e)
1587 {
1588 *(pUserClipBuffer++) = baryCoeff[e];
1589 }
1590 }
1591 }
1592
1593 //////////////////////////////////////////////////////////////////////////
1594 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1595 /// Point precision from FP32.
1596 template <typename PT = FixedPointTraits<Fixed_16_8>>
1597 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1598 {
1599 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1600 return _simd_cvtps_epi32(vFixed);
1601 }
1602
1603 //////////////////////////////////////////////////////////////////////////
1604 /// @brief Helper function to set the X,Y coords of a triangle to the
1605 /// requested Fixed Point precision from FP32. If the RequestedT
1606 /// FixedPointTraits precision is the same as the CurrentT, no extra
1607 /// conversions will be done. If they are different, convert from FP32
1608 /// to the Requested precision and set vXi, vYi
1609 /// @tparam RequestedT: requested FixedPointTraits type
1610 /// @tparam CurrentT: FixedPointTraits type of the last
1611 template<typename RequestedT, typename CurrentT = FixedPointTraits<Fixed_Uninit>>
1612 struct FPToFixedPoint
1613 {
1614 //////////////////////////////////////////////////////////////////////////
1615 /// @param tri: simdvector[3] of FP triangle verts
1616 /// @param vXi: fixed point X coords of tri verts
1617 /// @param vYi: fixed point Y coords of tri verts
1618 INLINE static void Set(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1619 {
1620 vXi[0] = fpToFixedPointVertical<RequestedT>(tri[0].x);
1621 vYi[0] = fpToFixedPointVertical<RequestedT>(tri[0].y);
1622 vXi[1] = fpToFixedPointVertical<RequestedT>(tri[1].x);
1623 vYi[1] = fpToFixedPointVertical<RequestedT>(tri[1].y);
1624 vXi[2] = fpToFixedPointVertical<RequestedT>(tri[2].x);
1625 vYi[2] = fpToFixedPointVertical<RequestedT>(tri[2].y);
1626 };
1627 };
1628
1629 //////////////////////////////////////////////////////////////////////////
1630 /// @brief In the case where the RequestedT and CurrentT fixed point
1631 /// precisions are the same, do nothing.
1632 template<typename RequestedT>
1633 struct FPToFixedPoint<RequestedT, RequestedT>
1634 {
1635 INLINE static void Set(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3]){};
1636 };
1637
1638 //////////////////////////////////////////////////////////////////////////
1639 /// @brief Calculate bounding box for current triangle
1640 /// @tparam CT: ConservativeRastFETraits type
1641 /// @param vX: fixed point X position for triangle verts
1642 /// @param vY: fixed point Y position for triangle verts
1643 /// @param bbox: fixed point bbox
1644 /// *Note*: expects vX, vY to be in the correct precision for the type
1645 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1646 template <typename CT>
1647 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox){}
1648
1649 //////////////////////////////////////////////////////////////////////////
1650 /// @brief FEStandardRastT specialization of calcBoundingBoxIntVertical
1651 template <>
1652 INLINE void calcBoundingBoxIntVertical<FEStandardRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1653 {
1654 // FE conservative rast traits
1655 typedef FEStandardRastT CT;
1656
1657 static_assert(std::is_same<CT::BBoxPrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Standard rast BBox calculation needs to be in 16.8 precision");
1658 // Update vXi, vYi fixed point precision for BBox calculation if necessary
1659 FPToFixedPoint<CT::BBoxPrecisionT, CT::ZeroAreaPrecisionT>::Set(tri, vX, vY);
1660
1661 simdscalari vMinX = vX[0];
1662 vMinX = _simd_min_epi32(vMinX, vX[1]);
1663 vMinX = _simd_min_epi32(vMinX, vX[2]);
1664
1665 simdscalari vMaxX = vX[0];
1666 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1667 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1668
1669 simdscalari vMinY = vY[0];
1670 vMinY = _simd_min_epi32(vMinY, vY[1]);
1671 vMinY = _simd_min_epi32(vMinY, vY[2]);
1672
1673 simdscalari vMaxY = vY[0];
1674 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1675 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1676
1677 bbox.left = vMinX;
1678 bbox.right = vMaxX;
1679 bbox.top = vMinY;
1680 bbox.bottom = vMaxY;
1681 }
1682
1683 //////////////////////////////////////////////////////////////////////////
1684 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1685 /// Offsets BBox for conservative rast
1686 template <>
1687 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1688 {
1689 // FE conservative rast traits
1690 typedef FEConservativeRastT CT;
1691
1692 static_assert(std::is_same<CT::BBoxPrecisionT, FixedPointTraits<Fixed_16_9>>::value, "Conservative rast BBox calculation needs to be in 16.9 precision");
1693 // Update vXi, vYi fixed point precision for BBox calculation if necessary
1694 FPToFixedPoint<CT::BBoxPrecisionT, CT::ZeroAreaPrecisionT>::Set(tri, vX, vY);
1695
1696 simdscalari vMinX = vX[0];
1697 vMinX = _simd_min_epi32(vMinX, vX[1]);
1698 vMinX = _simd_min_epi32(vMinX, vX[2]);
1699
1700 simdscalari vMaxX = vX[0];
1701 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1702 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1703
1704 simdscalari vMinY = vY[0];
1705 vMinY = _simd_min_epi32(vMinY, vY[1]);
1706 vMinY = _simd_min_epi32(vMinY, vY[2]);
1707
1708 simdscalari vMaxY = vY[0];
1709 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1710 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1711
1712 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1713 bbox.left = _simd_srli_epi32(_simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1714 bbox.right = _simd_srli_epi32(_simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1715 bbox.top = _simd_srli_epi32(_simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1716 bbox.bottom = _simd_srli_epi32(_simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1717 }
1718
1719 //////////////////////////////////////////////////////////////////////////
1720 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1721 /// culling, viewport transform, etc.
1722 /// @param pDC - pointer to draw context.
1723 /// @param pa - The primitive assembly object.
1724 /// @param workerId - thread's worker id. Every thread has a unique id.
1725 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1726 /// @param primID - Primitive ID for each triangle.
1727 /// @tparam CT - ConservativeRastFETraits
1728 template <typename CT>
1729 void BinTriangles(
1730 DRAW_CONTEXT *pDC,
1731 PA_STATE& pa,
1732 uint32_t workerId,
1733 simdvector tri[3],
1734 uint32_t triMask,
1735 simdscalari primID)
1736 {
1737 RDTSC_START(FEBinTriangles);
1738
1739 const API_STATE& state = GetApiState(pDC);
1740 const SWR_RASTSTATE& rastState = state.rastState;
1741 const SWR_FRONTEND_STATE& feState = state.frontendState;
1742 const SWR_GS_STATE& gsState = state.gsState;
1743 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1744
1745
1746 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1747 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1748 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1749
1750 if (!feState.vpTransformDisable)
1751 {
1752 // perspective divide
1753 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1754 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1755 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1756
1757 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1758 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1759 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1760
1761 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1762 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1763 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1764
1765 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1766 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1767 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1768
1769 // viewport transform to screen coords
1770 viewportTransform<3>(tri, state.vpMatrix[0]);
1771 }
1772
1773 // adjust for pixel center location
1774 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1775 tri[0].x = _simd_add_ps(tri[0].x, offset);
1776 tri[0].y = _simd_add_ps(tri[0].y, offset);
1777
1778 tri[1].x = _simd_add_ps(tri[1].x, offset);
1779 tri[1].y = _simd_add_ps(tri[1].y, offset);
1780
1781 tri[2].x = _simd_add_ps(tri[2].x, offset);
1782 tri[2].y = _simd_add_ps(tri[2].y, offset);
1783
1784 simdscalari vXi[3], vYi[3];
1785 // Set vXi, vYi to fixed point precision required for degenerate triangle check
1786 FPToFixedPoint<typename CT::ZeroAreaPrecisionT>::Set(tri, vXi, vYi);
1787
1788 // triangle setup
1789 simdscalari vAi[3], vBi[3];
1790 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1791
1792 // determinant
1793 simdscalari vDet[2];
1794 calcDeterminantIntVertical(vAi, vBi, vDet);
1795
1796 /// todo: handle degen tri's for Conservative Rast.
1797
1798 // cull zero area
1799 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1800 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1801
1802 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1803
1804 uint32_t origTriMask = triMask;
1805 triMask &= ~cullZeroAreaMask;
1806
1807 // determine front winding tris
1808 // CW +det
1809 // CCW -det
1810 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1811 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1812 int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1813
1814 uint32_t frontWindingTris;
1815 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1816 {
1817 frontWindingTris = cwTriMask;
1818 }
1819 else
1820 {
1821 frontWindingTris = ~cwTriMask;
1822 }
1823
1824 // cull
1825 uint32_t cullTris;
1826 switch ((SWR_CULLMODE)rastState.cullMode)
1827 {
1828 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1829 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1830 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1831 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1832 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1833 }
1834
1835 triMask &= ~cullTris;
1836
1837 if (origTriMask ^ triMask)
1838 {
1839 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1840 }
1841
1842 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
1843 // compute per tri backface
1844 uint32_t frontFaceMask = frontWindingTris;
1845 uint32_t *pPrimID = (uint32_t *)&primID;
1846 DWORD triIndex = 0;
1847 // for center sample pattern, all samples are at pixel center; calculate coverage
1848 // once at center and broadcast the results in the backend
1849 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1850 PFN_WORK_FUNC pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1851 pDC->pState->state.psState.inputCoverage, (rastState.scissorEnable > 0));
1852 if (!triMask)
1853 {
1854 goto endBinTriangles;
1855 }
1856
1857 // Calc bounding box of triangles
1858 simdBBox bbox;
1859 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1860
1861 // determine if triangle falls between pixel centers and discard
1862 // only discard for non-MSAA case and when conservative rast is disabled
1863 // (left + 127) & ~255
1864 // (right + 128) & ~255
1865 if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1866 {
1867 origTriMask = triMask;
1868
1869 int cullCenterMask;
1870 {
1871 simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1872 left = _simd_and_si(left, _simd_set1_epi32(~255));
1873 simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1874 right = _simd_and_si(right, _simd_set1_epi32(~255));
1875
1876 simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1877
1878 simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1879 top = _simd_and_si(top, _simd_set1_epi32(~255));
1880 simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1881 bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1882
1883 simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1884 vMaskV = _simd_or_si(vMaskH, vMaskV);
1885 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1886 }
1887
1888 triMask &= ~cullCenterMask;
1889
1890 if(origTriMask ^ triMask)
1891 {
1892 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1893 }
1894 }
1895
1896 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1897 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1898 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1899 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1900 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1901
1902 // Cull tris completely outside scissor
1903 {
1904 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1905 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1906 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1907 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1908 triMask = triMask & ~maskOutsideScissor;
1909 }
1910
1911 if (!triMask)
1912 {
1913 goto endBinTriangles;
1914 }
1915
1916 // Convert triangle bbox to macrotile units.
1917 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1918 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1919 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1920 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1921
1922 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1923 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
1924 _simd_store_si((simdscalari*)aMTRight, bbox.right);
1925 _simd_store_si((simdscalari*)aMTTop, bbox.top);
1926 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
1927
1928 // transpose verts needed for backend
1929 /// @todo modify BE to take non-transformed verts
1930 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
1931 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1932 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1933 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1934 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1935
1936 // store render target array index
1937 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1938 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1939 {
1940 simdvector vRtai[3];
1941 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1942 simdscalari vRtaii;
1943 vRtaii = _simd_castps_si(vRtai[0].x);
1944 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1945 }
1946 else
1947 {
1948 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1949 }
1950
1951 // scan remaining valid triangles and bin each separately
1952 while (_BitScanForward(&triIndex, triMask))
1953 {
1954 uint32_t linkageCount = state.linkageCount;
1955 uint32_t linkageMask = state.linkageMask;
1956 uint32_t numScalarAttribs = linkageCount * 4;
1957
1958 BE_WORK work;
1959 work.type = DRAW;
1960 work.pfnWork = pfnWork;
1961
1962 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1963
1964 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1965 desc.triFlags.primID = pPrimID[triIndex];
1966 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1967
1968 auto pArena = pDC->pArena;
1969 SWR_ASSERT(pArena != nullptr);
1970
1971 // store active attribs
1972 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1973 desc.pAttribs = pAttribs;
1974 desc.numAttribs = linkageCount;
1975 ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
1976
1977 // store triangle vertex data
1978 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1979
1980 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1981 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1982 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1983 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1984
1985 // store user clip distances
1986 if (rastState.clipDistanceMask)
1987 {
1988 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1989 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1990 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
1991 }
1992
1993 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1994 {
1995 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1996 {
1997 #if KNOB_ENABLE_TOSS_POINTS
1998 if (!KNOB_TOSS_SETUP_TRIS)
1999 #endif
2000 {
2001 pTileMgr->enqueue(x, y, &work);
2002 }
2003 }
2004 }
2005 triMask &= ~(1 << triIndex);
2006 }
2007
2008 endBinTriangles:
2009 RDTSC_STOP(FEBinTriangles, 1, 0);
2010 }
2011
2012 struct FEBinTrianglesChooser
2013 {
2014 typedef PFN_PROCESS_PRIMS FuncType;
2015
2016 template <typename... ArgsB>
2017 static FuncType GetFunc()
2018 {
2019 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2020 }
2021 };
2022
2023 // Selector for correct templated BinTrinagles function
2024 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2025 {
2026 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2027 }
2028
2029 //////////////////////////////////////////////////////////////////////////
2030 /// @brief Bin SIMD points to the backend. Only supports point size of 1
2031 /// @param pDC - pointer to draw context.
2032 /// @param pa - The primitive assembly object.
2033 /// @param workerId - thread's worker id. Even thread has a unique id.
2034 /// @param tri - Contains point position data for SIMDs worth of points.
2035 /// @param primID - Primitive ID for each point.
2036 void BinPoints(
2037 DRAW_CONTEXT *pDC,
2038 PA_STATE& pa,
2039 uint32_t workerId,
2040 simdvector prim[3],
2041 uint32_t primMask,
2042 simdscalari primID)
2043 {
2044 RDTSC_START(FEBinPoints);
2045
2046 simdvector& primVerts = prim[0];
2047
2048 const API_STATE& state = GetApiState(pDC);
2049 const SWR_FRONTEND_STATE& feState = state.frontendState;
2050 const SWR_GS_STATE& gsState = state.gsState;
2051 const SWR_RASTSTATE& rastState = state.rastState;
2052
2053 if (!feState.vpTransformDisable)
2054 {
2055 // perspective divide
2056 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
2057 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
2058 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
2059 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
2060
2061 // viewport transform to screen coords
2062 viewportTransform<1>(&primVerts, state.vpMatrix[0]);
2063 }
2064
2065 // adjust for pixel center location
2066 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2067 primVerts.x = _simd_add_ps(primVerts.x, offset);
2068 primVerts.y = _simd_add_ps(primVerts.y, offset);
2069
2070 // convert to fixed point
2071 simdscalari vXi, vYi;
2072 vXi = fpToFixedPointVertical(primVerts.x);
2073 vYi = fpToFixedPointVertical(primVerts.y);
2074
2075 if (CanUseSimplePoints(pDC))
2076 {
2077 // adjust for top-left rule
2078 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
2079 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
2080
2081 // cull points off the top-left edge of the viewport
2082 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
2083 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
2084
2085 // compute macro tile coordinates
2086 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2087 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2088
2089 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
2090 _simd_store_si((simdscalari*)aMacroX, macroX);
2091 _simd_store_si((simdscalari*)aMacroY, macroY);
2092
2093 // compute raster tile coordinates
2094 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
2095 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
2096
2097 // compute raster tile relative x,y for coverage mask
2098 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
2099 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
2100
2101 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
2102 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
2103
2104 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
2105 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
2106 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
2107 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
2108
2109 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
2110 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
2111 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
2112 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
2113
2114 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
2115 _simd_store_ps((float*)aZ, primVerts.z);
2116
2117 // store render target array index
2118 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2119 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2120 {
2121 simdvector vRtai;
2122 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
2123 simdscalari vRtaii = _simd_castps_si(vRtai.x);
2124 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2125 }
2126 else
2127 {
2128 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2129 }
2130
2131 uint32_t *pPrimID = (uint32_t *)&primID;
2132 DWORD primIndex = 0;
2133 // scan remaining valid triangles and bin each separately
2134 while (_BitScanForward(&primIndex, primMask))
2135 {
2136 uint32_t linkageCount = state.linkageCount;
2137 uint32_t linkageMask = state.linkageMask;
2138
2139 uint32_t numScalarAttribs = linkageCount * 4;
2140
2141 BE_WORK work;
2142 work.type = DRAW;
2143
2144 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2145
2146 // points are always front facing
2147 desc.triFlags.frontFacing = 1;
2148 desc.triFlags.primID = pPrimID[primIndex];
2149 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2150
2151 work.pfnWork = RasterizeSimplePoint;
2152
2153 auto pArena = pDC->pArena;
2154 SWR_ASSERT(pArena != nullptr);
2155
2156 // store attributes
2157 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
2158 desc.pAttribs = pAttribs;
2159 desc.numAttribs = linkageCount;
2160
2161 ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);
2162
2163 // store raster tile aligned x, y, perspective correct z
2164 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2165 desc.pTriBuffer = pTriBuffer;
2166 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
2167 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
2168 *pTriBuffer = aZ[primIndex];
2169
2170 uint32_t tX = aTileRelativeX[primIndex];
2171 uint32_t tY = aTileRelativeY[primIndex];
2172
2173 // pack the relative x,y into the coverageMask, the rasterizer will
2174 // generate the true coverage mask from it
2175 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
2176
2177 // bin it
2178 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2179 #if KNOB_ENABLE_TOSS_POINTS
2180 if (!KNOB_TOSS_SETUP_TRIS)
2181 #endif
2182 {
2183 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
2184 }
2185 primMask &= ~(1 << primIndex);
2186 }
2187 }
2188 else
2189 {
2190 // non simple points need to be potentially binned to multiple macro tiles
2191 simdscalar vPointSize;
2192 if (rastState.pointParam)
2193 {
2194 simdvector size[3];
2195 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
2196 vPointSize = size[0].x;
2197 }
2198 else
2199 {
2200 vPointSize = _simd_set1_ps(rastState.pointSize);
2201 }
2202
2203 // bloat point to bbox
2204 simdBBox bbox;
2205 bbox.left = bbox.right = vXi;
2206 bbox.top = bbox.bottom = vYi;
2207
2208 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
2209 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2210 bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2211 bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2212 bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2213 bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2214
2215 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2216 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2217 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2218 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2219 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2220
2221 // Cull bloated points completely outside scissor
2222 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2223 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2224 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2225 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2226 primMask = primMask & ~maskOutsideScissor;
2227
2228 // Convert bbox to macrotile units.
2229 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2230 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2231 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2232 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2233
2234 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2235 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2236 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2237 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2238 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2239
2240 // store render target array index
2241 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2242 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2243 {
2244 simdvector vRtai[2];
2245 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2246 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2247 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2248 }
2249 else
2250 {
2251 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2252 }
2253
2254 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
2255 _simd_store_ps((float*)aPointSize, vPointSize);
2256
2257 uint32_t *pPrimID = (uint32_t *)&primID;
2258
2259 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
2260 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
2261 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
2262
2263 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
2264 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
2265 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
2266
2267 // scan remaining valid prims and bin each separately
2268 DWORD primIndex;
2269 while (_BitScanForward(&primIndex, primMask))
2270 {
2271 uint32_t linkageCount = state.linkageCount;
2272 uint32_t linkageMask = state.linkageMask;
2273 uint32_t numScalarAttribs = linkageCount * 4;
2274
2275 BE_WORK work;
2276 work.type = DRAW;
2277
2278 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2279
2280 desc.triFlags.frontFacing = 1;
2281 desc.triFlags.primID = pPrimID[primIndex];
2282 desc.triFlags.pointSize = aPointSize[primIndex];
2283 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2284
2285 work.pfnWork = RasterizeTriPoint;
2286
2287 auto pArena = pDC->pArena;
2288 SWR_ASSERT(pArena != nullptr);
2289
2290 // store active attribs
2291 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2292 desc.numAttribs = linkageCount;
2293 ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
2294
2295 // store point vertex data
2296 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2297 desc.pTriBuffer = pTriBuffer;
2298 *pTriBuffer++ = aPrimVertsX[primIndex];
2299 *pTriBuffer++ = aPrimVertsY[primIndex];
2300 *pTriBuffer = aPrimVertsZ[primIndex];
2301
2302 // store user clip distances
2303 if (rastState.clipDistanceMask)
2304 {
2305 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2306 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2307 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2308 }
2309
2310 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2311 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2312 {
2313 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2314 {
2315 #if KNOB_ENABLE_TOSS_POINTS
2316 if (!KNOB_TOSS_SETUP_TRIS)
2317 #endif
2318 {
2319 pTileMgr->enqueue(x, y, &work);
2320 }
2321 }
2322 }
2323
2324 primMask &= ~(1 << primIndex);
2325 }
2326 }
2327
2328
2329
2330
2331 RDTSC_STOP(FEBinPoints, 1, 0);
2332 }
2333
2334 //////////////////////////////////////////////////////////////////////////
2335 /// @brief Bin SIMD lines to the backend.
2336 /// @param pDC - pointer to draw context.
2337 /// @param pa - The primitive assembly object.
2338 /// @param workerId - thread's worker id. Even thread has a unique id.
2339 /// @param tri - Contains line position data for SIMDs worth of points.
2340 /// @param primID - Primitive ID for each line.
2341 void BinLines(
2342 DRAW_CONTEXT *pDC,
2343 PA_STATE& pa,
2344 uint32_t workerId,
2345 simdvector prim[],
2346 uint32_t primMask,
2347 simdscalari primID)
2348 {
2349 RDTSC_START(FEBinLines);
2350
2351 const API_STATE& state = GetApiState(pDC);
2352 const SWR_RASTSTATE& rastState = state.rastState;
2353 const SWR_FRONTEND_STATE& feState = state.frontendState;
2354 const SWR_GS_STATE& gsState = state.gsState;
2355
2356 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2357 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2358
2359 if (!feState.vpTransformDisable)
2360 {
2361 // perspective divide
2362 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2363 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2364
2365 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2366 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2367
2368 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2369 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2370
2371 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2372 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2373
2374 // viewport transform to screen coords
2375 viewportTransform<2>(prim, state.vpMatrix[0]);
2376 }
2377
2378 // adjust for pixel center location
2379 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2380 prim[0].x = _simd_add_ps(prim[0].x, offset);
2381 prim[0].y = _simd_add_ps(prim[0].y, offset);
2382
2383 prim[1].x = _simd_add_ps(prim[1].x, offset);
2384 prim[1].y = _simd_add_ps(prim[1].y, offset);
2385
2386 // convert to fixed point
2387 simdscalari vXi[2], vYi[2];
2388 vXi[0] = fpToFixedPointVertical(prim[0].x);
2389 vYi[0] = fpToFixedPointVertical(prim[0].y);
2390 vXi[1] = fpToFixedPointVertical(prim[1].x);
2391 vYi[1] = fpToFixedPointVertical(prim[1].y);
2392
2393 // compute x-major vs y-major mask
2394 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2395 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2396 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2397 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2398
2399 // cull zero-length lines
2400 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2401 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2402
2403 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2404
2405 uint32_t *pPrimID = (uint32_t *)&primID;
2406
2407 simdscalar vUnused = _simd_setzero_ps();
2408
2409 // Calc bounding box of lines
2410 simdBBox bbox;
2411 bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
2412 bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
2413 bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
2414 bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
2415
2416 // bloat bbox by line width along minor axis
2417 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2418 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2419 simdBBox bloatBox;
2420 bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2421 bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2422 bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2423 bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2424
2425 bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
2426 bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
2427 bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
2428 bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
2429
2430 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2431 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2432 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2433 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2434 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2435
2436 // Cull prims completely outside scissor
2437 {
2438 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2439 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2440 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2441 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2442 primMask = primMask & ~maskOutsideScissor;
2443 }
2444
2445 if (!primMask)
2446 {
2447 goto endBinLines;
2448 }
2449
2450 // Convert triangle bbox to macrotile units.
2451 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2452 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2453 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2454 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2455
2456 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2457 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2458 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2459 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2460 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2461
2462 // transpose verts needed for backend
2463 /// @todo modify BE to take non-transformed verts
2464 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2465 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2466 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2467 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2468 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2469
2470 // store render target array index
2471 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2472 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2473 {
2474 simdvector vRtai[2];
2475 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2476 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2477 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2478 }
2479 else
2480 {
2481 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2482 }
2483
2484 // scan remaining valid prims and bin each separately
2485 DWORD primIndex;
2486 while (_BitScanForward(&primIndex, primMask))
2487 {
2488 uint32_t linkageCount = state.linkageCount;
2489 uint32_t linkageMask = state.linkageMask;
2490 uint32_t numScalarAttribs = linkageCount * 4;
2491
2492 BE_WORK work;
2493 work.type = DRAW;
2494
2495 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2496
2497 desc.triFlags.frontFacing = 1;
2498 desc.triFlags.primID = pPrimID[primIndex];
2499 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2500 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2501
2502 work.pfnWork = RasterizeLine;
2503
2504 auto pArena = pDC->pArena;
2505 SWR_ASSERT(pArena != nullptr);
2506
2507 // store active attribs
2508 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2509 desc.numAttribs = linkageCount;
2510 ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
2511
2512 // store line vertex data
2513 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2514 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2515 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2516 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2517 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2518
2519 // store user clip distances
2520 if (rastState.clipDistanceMask)
2521 {
2522 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2523 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2524 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2525 }
2526
2527 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2528 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2529 {
2530 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2531 {
2532 #if KNOB_ENABLE_TOSS_POINTS
2533 if (!KNOB_TOSS_SETUP_TRIS)
2534 #endif
2535 {
2536 pTileMgr->enqueue(x, y, &work);
2537 }
2538 }
2539 }
2540
2541 primMask &= ~(1 << primIndex);
2542 }
2543
2544 endBinLines:
2545
2546 RDTSC_STOP(FEBinLines, 1, 0);
2547 }