swr: [rasterizer] Small warning cleanup
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "utils.h"
37 #include "threads.h"
38 #include "pa.h"
39 #include "clip.h"
40 #include "tilemgr.h"
41 #include "tessellator.h"
42 #include <limits>
43
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE uint32_t GenMask(uint32_t numBits)
47 {
48 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
49 return ((1U << numBits) - 1);
50 }
51
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on the
/// raster state's pixel-location setting; indexed by SWR_PIXEL_LOCATION.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
60
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief FE handler for SwrSync.
63 /// @param pContext - pointer to SWR context.
64 /// @param pDC - pointer to draw context.
65 /// @param workerId - thread's worker id. Even thread has a unique id.
66 /// @param pUserData - Pointer to user data passed back to sync callback.
67 /// @todo This should go away when we switch this to use compute threading.
68 void ProcessSync(
69 SWR_CONTEXT *pContext,
70 DRAW_CONTEXT *pDC,
71 uint32_t workerId,
72 void *pUserData)
73 {
74 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78 work.desc.sync = *pSync;
79
80 MacroTileMgr *pTileMgr = pDC->pTileMgr;
81 pTileMgr->enqueue(0, 0, &work);
82 }
83
84 //////////////////////////////////////////////////////////////////////////
85 /// @brief FE handler for SwrGetStats.
86 /// @param pContext - pointer to SWR context.
87 /// @param pDC - pointer to draw context.
88 /// @param workerId - thread's worker id. Even thread has a unique id.
89 /// @param pUserData - Pointer to user data passed back to stats callback.
90 /// @todo This should go away when we switch this to use compute threading.
91 void ProcessQueryStats(
92 SWR_CONTEXT *pContext,
93 DRAW_CONTEXT *pDC,
94 uint32_t workerId,
95 void *pUserData)
96 {
97 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
98 BE_WORK work;
99 work.type = QUERYSTATS;
100 work.pfnWork = ProcessQueryStatsBE;
101 work.desc.queryStats = *pQueryStats;
102
103 MacroTileMgr *pTileMgr = pDC->pTileMgr;
104 pTileMgr->enqueue(0, 0, &work);
105 }
106
107 //////////////////////////////////////////////////////////////////////////
108 /// @brief FE handler for SwrClearRenderTarget.
109 /// @param pContext - pointer to SWR context.
110 /// @param pDC - pointer to draw context.
111 /// @param workerId - thread's worker id. Even thread has a unique id.
112 /// @param pUserData - Pointer to user data passed back to clear callback.
113 /// @todo This should go away when we switch this to use compute threading.
114 void ProcessClear(
115 SWR_CONTEXT *pContext,
116 DRAW_CONTEXT *pDC,
117 uint32_t workerId,
118 void *pUserData)
119 {
120 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
121 MacroTileMgr *pTileMgr = pDC->pTileMgr;
122
123 const API_STATE& state = GetApiState(pDC);
124
125 // queue a clear to each macro tile
126 // compute macro tile bounds for the current scissor/viewport
127 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
128 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
129 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
130 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
131
132 BE_WORK work;
133 work.type = CLEAR;
134 work.pfnWork = ProcessClearBE;
135 work.desc.clear = *pClear;
136
137 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
138 {
139 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
140 {
141 pTileMgr->enqueue(x, y, &work);
142 }
143 }
144 }
145
146 //////////////////////////////////////////////////////////////////////////
147 /// @brief FE handler for SwrStoreTiles.
148 /// @param pContext - pointer to SWR context.
149 /// @param pDC - pointer to draw context.
150 /// @param workerId - thread's worker id. Even thread has a unique id.
151 /// @param pUserData - Pointer to user data passed back to callback.
152 /// @todo This should go away when we switch this to use compute threading.
153 void ProcessStoreTiles(
154 SWR_CONTEXT *pContext,
155 DRAW_CONTEXT *pDC,
156 uint32_t workerId,
157 void *pUserData)
158 {
159 RDTSC_START(FEProcessStoreTiles);
160 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
161 MacroTileMgr *pTileMgr = pDC->pTileMgr;
162
163 const API_STATE& state = GetApiState(pDC);
164
165 // queue a store to each macro tile
166 // compute macro tile bounds for the current render target
167 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
168 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
169
170 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
171 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
172
173 // store tiles
174 BE_WORK work;
175 work.type = STORETILES;
176 work.pfnWork = ProcessStoreTileBE;
177 work.desc.storeTiles = *pStore;
178
179 for (uint32_t x = 0; x < numMacroTilesX; ++x)
180 {
181 for (uint32_t y = 0; y < numMacroTilesY; ++y)
182 {
183 pTileMgr->enqueue(x, y, &work);
184 }
185 }
186
187 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
188 }
189
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect is treated as "not provided"; bitwise OR is a cheap
    // way to test all four fields for zero at once.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles only: round the start up and the end
    // down so partially-covered edge tiles are excluded.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round the start down and the end up instead.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp to the hot-tile array bounds even in release builds.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // Queue one discard/invalidate work item per covered macrotile.
    // Note: end bounds are exclusive here, unlike ProcessClear's inclusive loop.
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
269
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Computes the number of primitives given the number of verts.
272 /// @param mode - primitive topology for draw operation.
273 /// @param numPrims - number of vertices or indices for draw.
274 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
275 uint32_t GetNumPrims(
276 PRIMITIVE_TOPOLOGY mode,
277 uint32_t numPrims)
278 {
279 switch (mode)
280 {
281 case TOP_POINT_LIST: return numPrims;
282 case TOP_TRIANGLE_LIST: return numPrims / 3;
283 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
284 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
285 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
286 case TOP_QUAD_LIST: return numPrims / 4;
287 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
288 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
289 case TOP_LINE_LIST: return numPrims / 2;
290 case TOP_LINE_LOOP: return numPrims;
291 case TOP_RECT_LIST: return numPrims / 3;
292 case TOP_LINE_LIST_ADJ: return numPrims / 4;
293 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
294 case TOP_TRI_LIST_ADJ: return numPrims / 6;
295 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
296
297 case TOP_PATCHLIST_1:
298 case TOP_PATCHLIST_2:
299 case TOP_PATCHLIST_3:
300 case TOP_PATCHLIST_4:
301 case TOP_PATCHLIST_5:
302 case TOP_PATCHLIST_6:
303 case TOP_PATCHLIST_7:
304 case TOP_PATCHLIST_8:
305 case TOP_PATCHLIST_9:
306 case TOP_PATCHLIST_10:
307 case TOP_PATCHLIST_11:
308 case TOP_PATCHLIST_12:
309 case TOP_PATCHLIST_13:
310 case TOP_PATCHLIST_14:
311 case TOP_PATCHLIST_15:
312 case TOP_PATCHLIST_16:
313 case TOP_PATCHLIST_17:
314 case TOP_PATCHLIST_18:
315 case TOP_PATCHLIST_19:
316 case TOP_PATCHLIST_20:
317 case TOP_PATCHLIST_21:
318 case TOP_PATCHLIST_22:
319 case TOP_PATCHLIST_23:
320 case TOP_PATCHLIST_24:
321 case TOP_PATCHLIST_25:
322 case TOP_PATCHLIST_26:
323 case TOP_PATCHLIST_27:
324 case TOP_PATCHLIST_28:
325 case TOP_PATCHLIST_29:
326 case TOP_PATCHLIST_30:
327 case TOP_PATCHLIST_31:
328 case TOP_PATCHLIST_32:
329 return numPrims / (mode - TOP_PATCHLIST_BASE);
330
331 case TOP_POLYGON:
332 case TOP_POINT_LIST_BF:
333 case TOP_LINE_STRIP_CONT:
334 case TOP_LINE_STRIP_BF:
335 case TOP_LINE_STRIP_CONT_BF:
336 case TOP_TRIANGLE_FAN_NOSTIPPLE:
337 case TOP_TRI_STRIP_REVERSE:
338 case TOP_PATCHLIST_BASE:
339 case TOP_UNKNOWN:
340 SWR_ASSERT(false, "Unsupported topology: %d", mode);
341 return 0;
342 }
343
344 return 0;
345 }
346
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Computes the number of verts given the number of primitives.
349 /// @param mode - primitive topology for draw operation.
350 /// @param numPrims - number of primitives for draw.
351 uint32_t GetNumVerts(
352 PRIMITIVE_TOPOLOGY mode,
353 uint32_t numPrims)
354 {
355 switch (mode)
356 {
357 case TOP_POINT_LIST: return numPrims;
358 case TOP_TRIANGLE_LIST: return numPrims * 3;
359 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
360 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
361 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
362 case TOP_QUAD_LIST: return numPrims * 4;
363 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
364 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
365 case TOP_LINE_LIST: return numPrims * 2;
366 case TOP_LINE_LOOP: return numPrims;
367 case TOP_RECT_LIST: return numPrims * 3;
368 case TOP_LINE_LIST_ADJ: return numPrims * 4;
369 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
370 case TOP_TRI_LIST_ADJ: return numPrims * 6;
371 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
372
373 case TOP_PATCHLIST_1:
374 case TOP_PATCHLIST_2:
375 case TOP_PATCHLIST_3:
376 case TOP_PATCHLIST_4:
377 case TOP_PATCHLIST_5:
378 case TOP_PATCHLIST_6:
379 case TOP_PATCHLIST_7:
380 case TOP_PATCHLIST_8:
381 case TOP_PATCHLIST_9:
382 case TOP_PATCHLIST_10:
383 case TOP_PATCHLIST_11:
384 case TOP_PATCHLIST_12:
385 case TOP_PATCHLIST_13:
386 case TOP_PATCHLIST_14:
387 case TOP_PATCHLIST_15:
388 case TOP_PATCHLIST_16:
389 case TOP_PATCHLIST_17:
390 case TOP_PATCHLIST_18:
391 case TOP_PATCHLIST_19:
392 case TOP_PATCHLIST_20:
393 case TOP_PATCHLIST_21:
394 case TOP_PATCHLIST_22:
395 case TOP_PATCHLIST_23:
396 case TOP_PATCHLIST_24:
397 case TOP_PATCHLIST_25:
398 case TOP_PATCHLIST_26:
399 case TOP_PATCHLIST_27:
400 case TOP_PATCHLIST_28:
401 case TOP_PATCHLIST_29:
402 case TOP_PATCHLIST_30:
403 case TOP_PATCHLIST_31:
404 case TOP_PATCHLIST_32:
405 return numPrims * (mode - TOP_PATCHLIST_BASE);
406
407 case TOP_POLYGON:
408 case TOP_POINT_LIST_BF:
409 case TOP_LINE_STRIP_CONT:
410 case TOP_LINE_STRIP_BF:
411 case TOP_LINE_STRIP_CONT_BF:
412 case TOP_TRIANGLE_FAN_NOSTIPPLE:
413 case TOP_TRI_STRIP_REVERSE:
414 case TOP_PATCHLIST_BASE:
415 case TOP_UNKNOWN:
416 SWR_ASSERT(false, "Unsupported topology: %d", mode);
417 return 0;
418 }
419
420 return 0;
421 }
422
423 //////////////////////////////////////////////////////////////////////////
424 /// @brief Return number of verts per primitive.
425 /// @param topology - topology
426 /// @param includeAdjVerts - include adjacent verts in primitive vertices
427 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
428 {
429 uint32_t numVerts = 0;
430 switch (topology)
431 {
432 case TOP_POINT_LIST:
433 case TOP_POINT_LIST_BF:
434 numVerts = 1;
435 break;
436 case TOP_LINE_LIST:
437 case TOP_LINE_STRIP:
438 case TOP_LINE_LIST_ADJ:
439 case TOP_LINE_LOOP:
440 case TOP_LINE_STRIP_CONT:
441 case TOP_LINE_STRIP_BF:
442 case TOP_LISTSTRIP_ADJ:
443 numVerts = 2;
444 break;
445 case TOP_TRIANGLE_LIST:
446 case TOP_TRIANGLE_STRIP:
447 case TOP_TRIANGLE_FAN:
448 case TOP_TRI_LIST_ADJ:
449 case TOP_TRI_STRIP_ADJ:
450 case TOP_TRI_STRIP_REVERSE:
451 case TOP_RECT_LIST:
452 numVerts = 3;
453 break;
454 case TOP_QUAD_LIST:
455 case TOP_QUAD_STRIP:
456 numVerts = 4;
457 break;
458 case TOP_PATCHLIST_1:
459 case TOP_PATCHLIST_2:
460 case TOP_PATCHLIST_3:
461 case TOP_PATCHLIST_4:
462 case TOP_PATCHLIST_5:
463 case TOP_PATCHLIST_6:
464 case TOP_PATCHLIST_7:
465 case TOP_PATCHLIST_8:
466 case TOP_PATCHLIST_9:
467 case TOP_PATCHLIST_10:
468 case TOP_PATCHLIST_11:
469 case TOP_PATCHLIST_12:
470 case TOP_PATCHLIST_13:
471 case TOP_PATCHLIST_14:
472 case TOP_PATCHLIST_15:
473 case TOP_PATCHLIST_16:
474 case TOP_PATCHLIST_17:
475 case TOP_PATCHLIST_18:
476 case TOP_PATCHLIST_19:
477 case TOP_PATCHLIST_20:
478 case TOP_PATCHLIST_21:
479 case TOP_PATCHLIST_22:
480 case TOP_PATCHLIST_23:
481 case TOP_PATCHLIST_24:
482 case TOP_PATCHLIST_25:
483 case TOP_PATCHLIST_26:
484 case TOP_PATCHLIST_27:
485 case TOP_PATCHLIST_28:
486 case TOP_PATCHLIST_29:
487 case TOP_PATCHLIST_30:
488 case TOP_PATCHLIST_31:
489 case TOP_PATCHLIST_32:
490 numVerts = topology - TOP_PATCHLIST_BASE;
491 break;
492 default:
493 SWR_ASSERT(false, "Unsupported topology: %d", topology);
494 break;
495 }
496
497 if (includeAdjVerts)
498 {
499 switch (topology)
500 {
501 case TOP_LISTSTRIP_ADJ:
502 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
503 case TOP_TRI_STRIP_ADJ:
504 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
505 default: break;
506 }
507 }
508
509 return numVerts;
510 }
511
512 //////////////////////////////////////////////////////////////////////////
513 /// @brief Generate mask from remaining work.
514 /// @param numWorkItems - Number of items being worked on by a SIMD.
515 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
516 {
517 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
518 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
519 return _simd_castps_si(vMask(mask));
520 }
521
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object holding the assembled prims.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the per-prim attributes are staged into
///        before the streamout jit function consumes them.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Each set bit in soMask is one attribute slot to stream out.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): this comment used to claim the offset is computed
            // as (slot - 1), but the code below uses slot directly — the PA
            // slot remap is handled via VERTEX_ATTRIB_START_SLOT above.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);

            // The SOS increments the existing write offset. So we don't want to increment
            // the SoWriteOffset stat using an absolute offset instead of relative.
            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
        }
    }

    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
610
611 //////////////////////////////////////////////////////////////////////////
612 /// @brief Computes number of invocations. The current index represents
613 /// the start of the SIMD. The max index represents how much work
614 /// items are remaining. If there is less then a SIMD's left of work
615 /// then return the remaining amount of work.
616 /// @param curIndex - The start index for the SIMD.
617 /// @param maxIndex - The last index for all work items.
618 static INLINE uint32_t GetNumInvocations(
619 uint32_t curIndex,
620 uint32_t maxIndex)
621 {
622 uint32_t remainder = (maxIndex - curIndex);
623 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
624 }
625
626 //////////////////////////////////////////////////////////////////////////
627 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
628 /// The geometry shader will loop over each active streamout buffer, assembling
629 /// primitives for the downstream stages. When multistream output is enabled,
630 /// the generated stream ID buffer from the GS needs to be converted to a cut
631 /// buffer for the primitive assembler.
632 /// @param stream - stream id to generate the cut buffer for
633 /// @param pStreamIdBase - pointer to the stream ID buffer
634 /// @param numEmittedVerts - Number of total verts emitted by the GS
635 /// @param pCutBuffer - output buffer to write cuts to
636 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
637 {
638 SWR_ASSERT(stream < MAX_SO_STREAMS);
639
640 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
641 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
642
643 for (uint32_t b = 0; b < numOutputBytes; ++b)
644 {
645 uint8_t curInputByte = pStreamIdBase[2*b];
646 uint8_t outByte = 0;
647 for (uint32_t i = 0; i < 4; ++i)
648 {
649 if ((curInputByte & 0x3) != stream)
650 {
651 outByte |= (1 << i);
652 }
653 curInputByte >>= 2;
654 }
655
656 curInputByte = pStreamIdBase[2 * b + 1];
657 for (uint32_t i = 0; i < 4; ++i)
658 {
659 if ((curInputByte & 0x3) != stream)
660 {
661 outByte |= (1 << (i + 4));
662 }
663 curInputByte >>= 2;
664 }
665
666 *pCutBuffer++ = outByte;
667 }
668 }
669
// Per-worker-thread GS execution context, reused across GeometryShaderStage calls.
THREAD SWR_GS_CONTEXT tlsGsContext;
671
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (single-stream) or streamID (multi-stream) buffer
/// @param pStreamCutBuffer - scratch cut buffer used for multi-stream output
/// @param pSoPrimData - scratch buffer for streamout primitive data
/// @param primID - SIMD of input primitive IDs
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Buffer layout strides. These must agree with AllocateGsBuffers, which
    // sizes the allocations from the same gsState fields.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per emitted vert
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi-stream: 2 stream-ID bits per emitted vert, dword aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Run the GS once per instance over the whole SIMD of input prims.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            // numAttribs = index of highest set bit in feAttribMask, plus one
            DWORD numAttribs;
            if (_BitScanReverse(&numAttribs, state.feAttribMask))
            {
                numAttribs++;
            }
            else
            {
                numAttribs = 0;
            }

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
881
//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
/// @param ppStreamCutBuffer - pointer to per-stream scratch cut buffer
///        allocation (only used when multi-stream output is enabled)
/// @note The strides computed here must match GeometryShaderStage, which
///       derives the same values from gsState.
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // cut buffer: 1 bit per emitted vert; streamID buffer: 2 bits per vert, dword aligned
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
923
924 //////////////////////////////////////////////////////////////////////////
925 /// @brief Contains all data generated by the HS and passed to the
926 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // per-thread hull shader context
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output; one patch per SIMD lane
    void* pTxCtx;                           // backing storage for the tessellator context
    size_t tsCtxSize;                       // size in bytes of pTxCtx; grown on demand in TessellationStages

    simdscalar* pDSOutput;                  // domain shader output buffer; reallocated when too small
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in output vectors
};
937
938 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
939
940 //////////////////////////////////////////////////////////////////////////
941 /// @brief Allocate tessellation data for this worker thread.
942 INLINE
943 static void AllocateTessellationData(SWR_CONTEXT* pContext)
944 {
945 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
946 if (gt_pTessellationThreadData == nullptr)
947 {
948 gt_pTessellationThreadData = (TessellationThreadLocalData*)
949 _aligned_malloc(sizeof(TessellationThreadLocalData), 64);
950 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
951 }
952 }
953
954 //////////////////////////////////////////////////////////////////////////
955 /// @brief Implements Tessellation Stages.
956 /// @param pDC - pointer to draw context.
957 /// @param workerId - thread's worker id. Even thread has a unique id.
958 /// @param pa - The primitive assembly object.
959 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator context in the thread-local backing store.
    // NOTE(review): presumably TSInitCtx reports the required size through
    // tsCtxSize and returns null until the store is large enough, hence the
    // realloc-and-retry below — confirm against the tessellator API.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Pick the clipper/binner entry point matching the DS output topology;
    // stays null when rasterization is disabled.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison HS output in debug so stale reads are obvious.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (SIMD lane) independently.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        if (tsData.NumPrimitives == 0)
        {
            // Patch culled by the tessellator (e.g. zero tess factors).
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory — grow-only cache in thread-local storage.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            _aligned_free(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all generated domain points, SIMD-wide.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // Mask off lanes past NumDomainPoints in the final iteration.
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);

        // Re-assemble the tessellated output into primitives and feed the
        // remaining pipeline (GS, or SO/raster directly).
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1148
1149 //////////////////////////////////////////////////////////////////////////
1150 /// @brief FE handler for SwrDraw.
1151 /// @tparam IsIndexedT - Is indexed drawing enabled
1152 /// @tparam HasTessellationT - Is tessellation enabled
1153 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1154 /// @tparam HasStreamOutT - Is stream-out enabled
1155 /// @tparam HasRastT - Is rasterization enabled
1156 /// @param pContext - pointer to SWR context.
1157 /// @param pDC - pointer to draw context.
1158 /// @param workerId - thread's worker id.
1159 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, locate one-past-the-last requested index so partial
    // SIMD fetches at the end of the index buffer can be masked off.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // Set up optional-stage scratch buffers up front, outside the hot loop.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);

        // publish the current stream-out write offsets to the stats counters
        for (uint32_t i = 0; i < 4; ++i)
        {
            SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
        }

    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // Non-indexed: synthesize a SIMD-wide run of sequential vertex IDs.
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT(IaPrimitives, pa.NumPrims());

                            // Dispatch to the first enabled downstream stage;
                            // tessellation takes precedence over GS.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // Advance to the next SIMD-wide batch of vertices.
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    // NOTE(review): numPrims only exists under KNOB_ENABLE_RDTSC; presumably
    // RDTSC_STOP expands to nothing otherwise — confirm in rdtsc_core.h.
    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1421
//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to map runtime bool flags
///        onto the matching ProcessDraw template instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    // Returns the ProcessDraw instantiation for the given pack of
    // compile-time flags (indexed / tess / GS / SO / raster).
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1432
1433
1434 // Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    // Translate the runtime flags into the matching compile-time
    // ProcessDraw instantiation via FEDrawChooser.
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1444
1445
1446 //////////////////////////////////////////////////////////////////////////
1447 /// @brief Processes attributes for the backend based on linkage mask and
1448 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1449 /// @param pDC - Draw context
1450 /// @param pa - Primitive Assembly state
1451 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1452 /// @param pLinkageMap - maps VS attribute slot to PS slot
1453 /// @param triIndex - Triangle to process attributes for
1454 /// @param pBuffer - Output result
template<uint32_t NumVerts>
INLINE void ProcessAttributes(
    DRAW_CONTEXT *pDC,
    PA_STATE&pa,
    uint32_t linkageMask,
    const uint8_t* pLinkageMap,
    uint32_t triIndex,
    float *pBuffer)
{
    DWORD slot = 0;
    uint32_t mapIdx = 0;
    LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;

    // Walk each set bit of linkageMask: one 4-wide attribute per PS input.
    while (_BitScanForward(&slot, linkageMask))
    {
        linkageMask &= ~(1 << slot); // done with this bit.

        // compute absolute slot in vertex attrib array
        uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];

        __m128 attrib[3]; // triangle attribs (always 4 wide)
        pa.AssembleSingle(inputSlot, triIndex, attrib);

        if (_bittest(&constantInterpMask, mapIdx))
        {
            // Flat-shaded attribute: replicate the provoking vertex's value
            // to every vertex slot.
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[provokingVertex]);
                pBuffer += 4;
            }
        }
        else
        {
            // Interpolated attribute: store per-vertex values in order.
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[i]);
                pBuffer += 4;
            }
        }

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t i = NumVerts; i < 3; ++i)
        {
            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
            pBuffer += 4;
        }

        mapIdx++;
    }
}
1509
1510 //////////////////////////////////////////////////////////////////////////
1511 /// @brief Processes enabled user clip distances. Loads the active clip
1512 /// distances from the PA, sets up barycentric equations, and
1513 /// stores the results to the output buffer
1514 /// @param pa - Primitive Assembly state
1515 /// @param primIndex - primitive index to process
1516 /// @param clipDistMask - mask of enabled clip distances
1517 /// @param pUserClipBuffer - buffer to store results
1518 template<uint32_t NumVerts>
1519 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1520 {
1521 DWORD clipDist;
1522 while (_BitScanForward(&clipDist, clipDistMask))
1523 {
1524 clipDistMask &= ~(1 << clipDist);
1525 uint32_t clipSlot = clipDist >> 2;
1526 uint32_t clipComp = clipDist & 0x3;
1527 uint32_t clipAttribSlot = clipSlot == 0 ?
1528 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1529
1530 __m128 primClipDist[3];
1531 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1532
1533 float vertClipDist[NumVerts];
1534 for (uint32_t e = 0; e < NumVerts; ++e)
1535 {
1536 OSALIGNSIMD(float) aVertClipDist[4];
1537 _mm_store_ps(aVertClipDist, primClipDist[e]);
1538 vertClipDist[e] = aVertClipDist[clipComp];
1539 };
1540
1541 // setup plane equations for barycentric interpolation in the backend
1542 float baryCoeff[NumVerts];
1543 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1544 {
1545 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1546 }
1547 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1548
1549 for (uint32_t e = 0; e < NumVerts; ++e)
1550 {
1551 *(pUserClipBuffer++) = baryCoeff[e];
1552 }
1553 }
1554 }
1555
1556 //////////////////////////////////////////////////////////////////////////
1557 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1558 /// culling, viewport transform, etc.
1559 /// @param pDC - pointer to draw context.
1560 /// @param pa - The primitive assembly object.
1561 /// @param workerId - thread's worker id. Even thread has a unique id.
1562 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1563 /// @param primID - Primitive ID for each triangle.
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID)
{
    RDTSC_START(FEBinTriangles);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // viewport transform to screen coords
        viewportTransform<3>(tri, state.vpMatrix[0]);
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    // convert to fixed point
    simdscalari vXi[3], vYi[3];
    vXi[0] = fpToFixedPointVertical(tri[0].x);
    vYi[0] = fpToFixedPointVertical(tri[0].y);
    vXi[1] = fpToFixedPointVertical(tri[1].x);
    vYi[1] = fpToFixedPointVertical(tri[1].y);
    vXi[2] = fpToFixedPointVertical(tri[2].x);
    vYi[2] = fpToFixedPointVertical(tri[2].y);

    // triangle setup
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    triMask &= ~cullZeroAreaMask;

    // determine front winding tris
    // CW  +det
    // CCW -det
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;

    uint32_t *pPrimID = (uint32_t *)&primID;
    DWORD triIndex = 0;

    if (!triMask)
    {
        // All triangles culled; skip straight to the timing stop.
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical(vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case
    // (left + 127) & ~255
    // (right + 128) & ~255
    // NOTE(review): the 127/128/~255 math implies x.8 fixed-point (256 units
    // per pixel) — consistent with the "1 ULP in x.8" comment below.
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
            left = _simd_and_si(left, _simd_set1_epi32(~255));
            simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
            right = _simd_and_si(right, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(left, right);

            simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
            top = _simd_and_si(top, _simd_set1_epi32(~255));
            simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
            bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // Spill per-triangle macrotile extents to scalar arrays for the bin loop.
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];

        if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
        {
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
        }
        else
        {
            // for center sample pattern, all samples are at pixel center; calculate coverage
            // once at center and broadcast the results in the backend
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
        }

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);

        // store triangle vertex data
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // Enqueue the triangle into every macrotile its bbox touches.
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    RDTSC_STOP(FEBinTriangles, 1, 0);
}
1857
1858
1859
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend.
///        Two paths:
///        - "simple" points (1-pixel, see CanUseSimplePoints): binned to a
///          single macrotile and rasterized via RasterizeSimplePoint, with
///          the pixel's raster-tile-relative x,y packed into coverageMask.
///        - sized points: bounding box is bloated by the per-vertex or
///          state point size, possibly spanning multiple macrotiles, and
///          rasterized via RasterizeTriPoint.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points;
///               only prim[0] is used (points have a single vertex).
/// @param primMask - bit mask of valid SIMD lanes, one bit per point.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // points use only the first vertex of the primitive
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
    }

    // adjust for pixel center location (upper-left vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert screen coords to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport: after the
        // adjustment a negative coordinate sets the sign bit, which
        // movemask extracts per lane
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        // spill SIMD lanes to scalar arrays for the per-point binning loop
        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index; only meaningful when a GS emits it
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;
        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;

            // 4 scalar components per linked attribute
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            // points reuse the triangle work descriptor
            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes (3 verts worth, as expected by the backend)
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            // x and y are stored as raw integer bits in the float buffer
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it — a simple point always lands in exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size from the point-size vertex slot
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from rasterizer state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start from a degenerate box at the point
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        // expand by half the point size in each direction (fixed point)
        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor (box became inverted)
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        // spill macrotile extents to scalar arrays for the binning loop
        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index; only meaningful when a GS emits it
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            // sized points are rasterized as two triangles in the backend
            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

            // store point vertex data (screen-space x, y, z)
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue work for each macrotile the point's bbox overlaps
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }




    RDTSC_STOP(FEBinPoints, 1, 0);
}
2164
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend. Computes each line's screen-space
///        bounding box (bloated by half the line width along the minor
///        axis), intersects it with the scissor, and enqueues RasterizeLine
///        work for every macrotile the box overlaps.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains end-point position data for SIMDs worth of lines
///               (prim[0] and prim[1] are the two end points).
/// @param primMask - bit mask of valid SIMD lanes, one bit per line.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // 1/w per end point; remains 1.0 when the viewport transform is disabled
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide of x/y/z for both end points
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrix[0]);
    }

    // adjust for pixel center location (upper-left vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert screen coords to fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask (y-major when |dy| > |dx|)
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (both end points at the same fixed-point coord)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    uint32_t *pPrimID = (uint32_t *)&primID;

    // placeholder third "vertex" for the 3-wide transpose below
    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // y-major lines take the bloated horizontal extent, x-major lines the
    // bloated vertical extent
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor (box became inverted)
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        // everything culled; still need the RDTSC_STOP bookkeeping
        goto endBinLines;
    }

    // Convert line bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // spill macrotile extents to scalar arrays for the binning loop
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index; only meaningful when a GS emits it
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        // 4 scalar components per linked attribute
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        // lines reuse the triangle work descriptor
        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        // lines are always front facing
        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

        // store line vertex data (transposed x, y, z, 1/w for this lane)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue work for each macrotile the line's bbox overlaps
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}