swr/rast: code cleanup (no functional change)
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
40 void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
41
42 #if USE_SIMD16_FRONTEND
43 void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
44 void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
45 #endif
46
47 //////////////////////////////////////////////////////////////////////////
48 /// @brief Processes attributes for the backend based on linkage mask and
49 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
50 /// @param pDC - Draw context
51 /// @param pa - Primitive Assembly state
52 /// @param linkageMask - Specifies which VS outputs are routed to PS.
53 /// @param pLinkageMap - maps VS attribute slot to PS slot
54 /// @param triIndex - Triangle to process attributes for
55 /// @param pBuffer - Output result
56 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
57 INLINE void ProcessAttributes(
58 DRAW_CONTEXT *pDC,
59 PA_STATE&pa,
60 uint32_t triIndex,
61 uint32_t primId,
62 float *pBuffer)
63 {
64 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
65 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
66 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
67 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
68 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
69 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
70
71 static const float constTable[3][4] = {
72 { 0.0f, 0.0f, 0.0f, 0.0f },
73 { 0.0f, 0.0f, 0.0f, 1.0f },
74 { 1.0f, 1.0f, 1.0f, 1.0f }
75 };
76
77 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
78 {
79 uint32_t inputSlot;
80 if (IsSwizzledT::value)
81 {
82 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
83 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
84
85 }
86 else
87 {
88 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
89 }
90
91 __m128 attrib[3]; // triangle attribs (always 4 wide)
92 float* pAttribStart = pBuffer;
93
94 if (HasConstantInterpT::value || IsDegenerate::value)
95 {
96 if (_bittest(&constantInterpMask, i))
97 {
98 uint32_t vid;
99 uint32_t adjustedTriIndex;
100 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
101 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
102 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
103 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
104 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
105
106 switch (topo) {
107 case TOP_QUAD_LIST:
108 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
109 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
110 break;
111 case TOP_QUAD_STRIP:
112 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
113 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
114 break;
115 case TOP_TRIANGLE_STRIP:
116 adjustedTriIndex = triIndex;
117 vid = (triIndex & 1)
118 ? tristripProvokingVertex[provokingVertex]
119 : provokingVertex;
120 break;
121 default:
122 adjustedTriIndex = triIndex;
123 vid = provokingVertex;
124 break;
125 }
126
127 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
128
129 for (uint32_t i = 0; i < NumVertsT::value; ++i)
130 {
131 _mm_store_ps(pBuffer, attrib[vid]);
132 pBuffer += 4;
133 }
134 }
135 else
136 {
137 pa.AssembleSingle(inputSlot, triIndex, attrib);
138
139 for (uint32_t i = 0; i < NumVertsT::value; ++i)
140 {
141 _mm_store_ps(pBuffer, attrib[i]);
142 pBuffer += 4;
143 }
144 }
145 }
146 else
147 {
148 pa.AssembleSingle(inputSlot, triIndex, attrib);
149
150 for (uint32_t i = 0; i < NumVertsT::value; ++i)
151 {
152 _mm_store_ps(pBuffer, attrib[i]);
153 pBuffer += 4;
154 }
155 }
156
157 // pad out the attrib buffer to 3 verts to ensure the triangle
158 // interpolation code in the pixel shader works correctly for the
159 // 3 topologies - point, line, tri. This effectively zeros out the
160 // effect of the missing vertices in the triangle interpolation.
161 for (uint32_t v = NumVertsT::value; v < 3; ++v)
162 {
163 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
164 pBuffer += 4;
165 }
166
167 // check for constant source overrides
168 if (IsSwizzledT::value)
169 {
170 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
171 if (mask)
172 {
173 DWORD comp;
174 while (_BitScanForward(&comp, mask))
175 {
176 mask &= ~(1 << comp);
177
178 float constantValue = 0.0f;
179 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
180 {
181 case SWR_CONSTANT_SOURCE_CONST_0000:
182 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
183 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
184 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
185 break;
186 case SWR_CONSTANT_SOURCE_PRIM_ID:
187 constantValue = *(float*)&primId;
188 break;
189 }
190
191 // apply constant value to all 3 vertices
192 for (uint32_t v = 0; v < 3; ++v)
193 {
194 pAttribStart[comp + v * 4] = constantValue;
195 }
196 }
197 }
198 }
199 }
200 }
201
202 //////////////////////////////////////////////////////////////////////////
203 /// @brief Gather scissor rect data based on per-prim viewport indices.
204 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
205 /// @param pViewportIndex - array of per-primitive vewport indexes.
206 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
207 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
208 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
209 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
210 //
211 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
212 template<size_t SimdWidth>
213 struct GatherScissors
214 {
215 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
216 simdscalari &scisXmin, simdscalari &scisYmin,
217 simdscalari &scisXmax, simdscalari &scisYmax)
218 {
219 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
220 }
221 };
222
223 template<>
224 struct GatherScissors<8>
225 {
226 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
227 simdscalari &scisXmin, simdscalari &scisYmin,
228 simdscalari &scisXmax, simdscalari &scisYmax)
229 {
230 scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
231 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
232 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
233 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
234 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
235 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
236 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
237 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
238 scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
239 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
240 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
241 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
242 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
243 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
244 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
245 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
246 scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
247 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
248 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
249 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
250 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
251 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
252 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
253 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
254 scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
255 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
256 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
257 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
258 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
259 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
260 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
261 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
262 }
263 };
264
265 #if USE_SIMD16_FRONTEND
266 template<size_t SimdWidth>
267 struct GatherScissors_simd16
268 {
269 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
270 simd16scalari &scisXmin, simd16scalari &scisYmin,
271 simd16scalari &scisXmax, simd16scalari &scisYmax)
272 {
273 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
274 }
275 };
276
277 template<>
278 struct GatherScissors_simd16<16>
279 {
280 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
281 simd16scalari &scisXmin, simd16scalari &scisYmin,
282 simd16scalari &scisXmax, simd16scalari &scisYmax)
283 {
284 scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
285 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
286 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
287 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
288 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
289 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
290 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
291 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
292 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
293 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
294 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
295 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
296 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
297 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
298 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
299 pScissorsInFixedPoint[pViewportIndex[15]].xmin);
300
301 scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
302 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
303 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
304 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
305 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
306 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
307 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
308 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
309 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
310 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
311 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
312 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
313 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
314 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
315 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
316 pScissorsInFixedPoint[pViewportIndex[15]].ymin);
317
318 scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
319 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
320 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
321 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
322 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
323 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
324 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
325 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
326 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
327 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
328 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
329 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
330 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
331 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
332 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
333 pScissorsInFixedPoint[pViewportIndex[15]].xmax);
334
335 scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
336 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
337 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
338 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
339 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
340 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
341 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
342 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
343 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
344 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
345 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
346 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
347 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
348 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
349 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
350 pScissorsInFixedPoint[pViewportIndex[15]].ymax);
351 }
352 };
353
354 #endif
355 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
356
357 struct ProcessAttributesChooser
358 {
359 typedef PFN_PROCESS_ATTRIBUTES FuncType;
360
361 template <typename... ArgsB>
362 static FuncType GetFunc()
363 {
364 return ProcessAttributes<ArgsB...>;
365 }
366 };
367
368 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
369 {
370 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
371 }
372
373 //////////////////////////////////////////////////////////////////////////
374 /// @brief Processes enabled user clip distances. Loads the active clip
375 /// distances from the PA, sets up barycentric equations, and
376 /// stores the results to the output buffer
377 /// @param pa - Primitive Assembly state
378 /// @param primIndex - primitive index to process
379 /// @param clipDistMask - mask of enabled clip distances
380 /// @param pUserClipBuffer - buffer to store results
381 template<uint32_t NumVerts>
382 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
383 {
384 DWORD clipDist;
385 while (_BitScanForward(&clipDist, clipDistMask))
386 {
387 clipDistMask &= ~(1 << clipDist);
388 uint32_t clipSlot = clipDist >> 2;
389 uint32_t clipComp = clipDist & 0x3;
390 uint32_t clipAttribSlot = clipSlot == 0 ?
391 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
392
393 __m128 primClipDist[3];
394 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
395
396 float vertClipDist[NumVerts];
397 for (uint32_t e = 0; e < NumVerts; ++e)
398 {
399 OSALIGNSIMD(float) aVertClipDist[4];
400 _mm_store_ps(aVertClipDist, primClipDist[e]);
401 vertClipDist[e] = aVertClipDist[clipComp];
402 };
403
404 // setup plane equations for barycentric interpolation in the backend
405 float baryCoeff[NumVerts];
406 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
407 for (uint32_t e = 0; e < NumVerts - 1; ++e)
408 {
409 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
410 }
411 baryCoeff[NumVerts - 1] = last;
412
413 for (uint32_t e = 0; e < NumVerts; ++e)
414 {
415 *(pUserClipBuffer++) = baryCoeff[e];
416 }
417 }
418 }
419
420 //////////////////////////////////////////////////////////////////////////
421 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
422 /// culling, viewport transform, etc.
423 /// @param pDC - pointer to draw context.
424 /// @param pa - The primitive assembly object.
425 /// @param workerId - thread's worker id. Every thread has a unique id.
426 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
427 /// @param primID - Primitive ID for each triangle.
428 /// @param viewportIdx - viewport array index for each triangle.
429 /// @tparam CT - ConservativeRastFETraits
430 template <typename CT>
431 void BinTriangles(
432 DRAW_CONTEXT *pDC,
433 PA_STATE& pa,
434 uint32_t workerId,
435 simdvector tri[3],
436 uint32_t triMask,
437 simdscalari primID,
438 simdscalari viewportIdx)
439 {
440 SWR_CONTEXT *pContext = pDC->pContext;
441
442 AR_BEGIN(FEBinTriangles, pDC->drawId);
443
444 const API_STATE& state = GetApiState(pDC);
445 const SWR_RASTSTATE& rastState = state.rastState;
446 const SWR_FRONTEND_STATE& feState = state.frontendState;
447 const SWR_GS_STATE& gsState = state.gsState;
448 MacroTileMgr *pTileMgr = pDC->pTileMgr;
449
450 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
451 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
452 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
453
454 if (feState.vpTransformDisable)
455 {
456 // RHW is passed in directly when VP transform is disabled
457 vRecipW0 = tri[0].v[3];
458 vRecipW1 = tri[1].v[3];
459 vRecipW2 = tri[2].v[3];
460 }
461 else
462 {
463 // Perspective divide
464 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
465 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
466 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
467
468 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
469 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
470 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
471
472 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
473 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
474 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
475
476 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
477 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
478 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
479
480 // Viewport transform to screen space coords
481 if (state.gsState.emitsViewportArrayIndex)
482 {
483 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
484 }
485 else
486 {
487 viewportTransform<3>(tri, state.vpMatrices);
488 }
489 }
490
491 // Adjust for pixel center location
492 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
493 tri[0].x = _simd_add_ps(tri[0].x, offset);
494 tri[0].y = _simd_add_ps(tri[0].y, offset);
495
496 tri[1].x = _simd_add_ps(tri[1].x, offset);
497 tri[1].y = _simd_add_ps(tri[1].y, offset);
498
499 tri[2].x = _simd_add_ps(tri[2].x, offset);
500 tri[2].y = _simd_add_ps(tri[2].y, offset);
501
502 simdscalari vXi[3], vYi[3];
503 // Set vXi, vYi to required fixed point precision
504 FPToFixedPoint(tri, vXi, vYi);
505
506 // triangle setup
507 simdscalari vAi[3], vBi[3];
508 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
509
510 // determinant
511 simdscalari vDet[2];
512 calcDeterminantIntVertical(vAi, vBi, vDet);
513
514 // cull zero area
515 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
516 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
517
518 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
519
520 uint32_t origTriMask = triMask;
521 // don't cull degenerate triangles if we're conservatively rasterizing
522 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
523 {
524 triMask &= ~cullZeroAreaMask;
525 }
526
527 // determine front winding tris
528 // CW +det
529 // CCW det < 0;
530 // 0 area triangles are marked as backfacing regardless of winding order,
531 // which is required behavior for conservative rast and wireframe rendering
532 uint32_t frontWindingTris;
533 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
534 {
535 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
536 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
537 }
538 else
539 {
540 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
541 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
542 }
543 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
544
545 // cull
546 uint32_t cullTris;
547 switch ((SWR_CULLMODE)rastState.cullMode)
548 {
549 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
550 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
551 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
552 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
553 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
554 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
555 }
556
557 triMask &= ~cullTris;
558
559 if (origTriMask ^ triMask)
560 {
561 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
562 }
563
564 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
565 // compute per tri backface
566 uint32_t frontFaceMask = frontWindingTris;
567 uint32_t *pPrimID = (uint32_t *)&primID;
568 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
569 DWORD triIndex = 0;
570 uint32_t edgeEnable;
571 PFN_WORK_FUNC pfnWork;
572 if (CT::IsConservativeT::value)
573 {
574 // determine which edges of the degenerate tri, if any, are valid to rasterize.
575 // used to call the appropriate templated rasterizer function
576 if (cullZeroAreaMask > 0)
577 {
578 // e0 = v1-v0
579 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
580 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
581 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
582
583 // e1 = v2-v1
584 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
585 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
586 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
587
588 // e2 = v0-v2
589 // if v0 == v1 & v1 == v2, v0 == v2
590 uint32_t e2Mask = e0Mask & e1Mask;
591 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
592
593 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
594 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
595 e0Mask = pdep_u32(e0Mask, 0x00249249);
596 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
597 e1Mask = pdep_u32(e1Mask, 0x00492492);
598 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
599 e2Mask = pdep_u32(e2Mask, 0x00924924);
600
601 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
602 }
603 else
604 {
605 edgeEnable = 0x00FFFFFF;
606 }
607 }
608 else
609 {
610 // degenerate triangles won't be sent to rasterizer; just enable all edges
611 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
612 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
613 }
614
615 if (!triMask)
616 {
617 goto endBinTriangles;
618 }
619
620 // Calc bounding box of triangles
621 simdBBox bbox;
622 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
623
624 // determine if triangle falls between pixel centers and discard
625 // only discard for non-MSAA case and when conservative rast is disabled
626 // (xmin + 127) & ~255
627 // (xmax + 128) & ~255
628 if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
629 (!CT::IsConservativeT::value))
630 {
631 origTriMask = triMask;
632
633 int cullCenterMask;
634 {
635 simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
636 xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
637 simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
638 xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
639
640 simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
641
642 simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
643 ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
644 simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
645 ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
646
647 simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
648 vMaskV = _simd_or_si(vMaskH, vMaskV);
649 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
650 }
651
652 triMask &= ~cullCenterMask;
653
654 if (origTriMask ^ triMask)
655 {
656 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
657 }
658 }
659
660 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
661 // Gather the AOS effective scissor rects based on the per-prim VP index.
662 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
663 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
664 if (state.gsState.emitsViewportArrayIndex)
665 {
666 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
667 scisXmin, scisYmin, scisXmax, scisYmax);
668 }
669 else // broadcast fast path for non-VPAI case.
670 {
671 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
672 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
673 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
674 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
675 }
676
677 // Make triangle bbox inclusive
678 bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
679 bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
680
681 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
682 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
683 bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
684 bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
685
686 if (CT::IsConservativeT::value)
687 {
688 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
689 // some area. Bump the xmax/ymax edges out
690 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
691 bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
692 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
693 bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
694 }
695
696 // Cull tris completely outside scissor
697 {
698 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
699 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
700 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
701 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
702 triMask = triMask & ~maskOutsideScissor;
703 }
704
705 endBinTriangles:
706
707 // Send surviving triangles to the line or point binner based on fill mode
708 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
709 {
710 // Simple non-conformant wireframe mode, useful for debugging.
711 // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
712 simdvector line[2];
713 simdscalar recipW[2];
714 line[0] = tri[0];
715 line[1] = tri[1];
716 recipW[0] = vRecipW0;
717 recipW[1] = vRecipW1;
718 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
719
720 line[0] = tri[1];
721 line[1] = tri[2];
722 recipW[0] = vRecipW1;
723 recipW[1] = vRecipW2;
724 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
725
726 line[0] = tri[2];
727 line[1] = tri[0];
728 recipW[0] = vRecipW2;
729 recipW[1] = vRecipW0;
730 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
731
732 AR_END(FEBinTriangles, 1);
733 return;
734 }
735 else if (rastState.fillMode == SWR_FILLMODE_POINT)
736 {
737 // Bin 3 points
738 BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
739 BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
740 BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
741 return;
742 }
743
744 // Convert triangle bbox to macrotile units.
745 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
746 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
747 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
748 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
749
750 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
751 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
752 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
753 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
754 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
755
756 // transpose verts needed for backend
757 /// @todo modify BE to take non-transformed verts
758 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
759 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
760 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
761 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
762 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
763
764 // store render target array index
765 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
766 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
767 {
768 simdvector vRtai[3];
769 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
770 simdscalari vRtaii;
771 vRtaii = _simd_castps_si(vRtai[0].x);
772 _simd_store_si((simdscalari*)aRTAI, vRtaii);
773 }
774 else
775 {
776 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
777 }
778
779 // scan remaining valid triangles and bin each separately
780 while (_BitScanForward(&triIndex, triMask))
781 {
782 uint32_t linkageCount = state.backendState.numAttributes;
783 uint32_t numScalarAttribs = linkageCount * 4;
784
785 BE_WORK work;
786 work.type = DRAW;
787
788 bool isDegenerate;
789 if (CT::IsConservativeT::value)
790 {
791 // only rasterize valid edges if we have a degenerate primitive
792 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
793 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
794 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
795
796 // Degenerate triangles are required to be constant interpolated
797 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
798 }
799 else
800 {
801 isDegenerate = false;
802 work.pfnWork = pfnWork;
803 }
804
805 // Select attribute processor
806 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
807 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
808
809 TRIANGLE_WORK_DESC &desc = work.desc.tri;
810
811 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
812 desc.triFlags.primID = pPrimID[triIndex];
813 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
814 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
815
816 auto pArena = pDC->pArena;
817 SWR_ASSERT(pArena != nullptr);
818
819 // store active attribs
820 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
821 desc.pAttribs = pAttribs;
822 desc.numAttribs = linkageCount;
823 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
824
825 // store triangle vertex data
826 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
827
828 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
829 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
830 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
831 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
832
833 // store user clip distances
834 if (rastState.clipDistanceMask)
835 {
836 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
837 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
838 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
839 }
840
841 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
842 {
843 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
844 {
845 #if KNOB_ENABLE_TOSS_POINTS
846 if (!KNOB_TOSS_SETUP_TRIS)
847 #endif
848 {
849 pTileMgr->enqueue(x, y, &work);
850 }
851 }
852 }
853 triMask &= ~(1 << triIndex);
854 }
855
856 AR_END(FEBinTriangles, 1);
857 }
858
859 #if USE_SIMD16_FRONTEND
860 template <typename CT>
861 void SIMDAPI BinTriangles_simd16(
862 DRAW_CONTEXT *pDC,
863 PA_STATE& pa,
864 uint32_t workerId,
865 simd16vector tri[3],
866 uint32_t triMask,
867 simd16scalari primID,
868 simd16scalari viewportIdx)
869 {
870 SWR_CONTEXT *pContext = pDC->pContext;
871
872 AR_BEGIN(FEBinTriangles, pDC->drawId);
873
874 const API_STATE& state = GetApiState(pDC);
875 const SWR_RASTSTATE& rastState = state.rastState;
876 const SWR_FRONTEND_STATE& feState = state.frontendState;
877 const SWR_GS_STATE& gsState = state.gsState;
878
879 MacroTileMgr *pTileMgr = pDC->pTileMgr;
880
881 simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
882 simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
883 simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
884
885 if (feState.vpTransformDisable)
886 {
887 // RHW is passed in directly when VP transform is disabled
888 vRecipW0 = tri[0].v[3];
889 vRecipW1 = tri[1].v[3];
890 vRecipW2 = tri[2].v[3];
891 }
892 else
893 {
894 // Perspective divide
895 vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
896 vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
897 vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);
898
899 tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
900 tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
901 tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);
902
903 tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
904 tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
905 tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);
906
907 tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
908 tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
909 tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
910
911 // Viewport transform to screen space coords
912 if (state.gsState.emitsViewportArrayIndex)
913 {
914 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
915 }
916 else
917 {
918 viewportTransform<3>(tri, state.vpMatrices);
919 }
920 }
921
922 // Adjust for pixel center location
923 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
924
925 tri[0].x = _simd16_add_ps(tri[0].x, offset);
926 tri[0].y = _simd16_add_ps(tri[0].y, offset);
927
928 tri[1].x = _simd16_add_ps(tri[1].x, offset);
929 tri[1].y = _simd16_add_ps(tri[1].y, offset);
930
931 tri[2].x = _simd16_add_ps(tri[2].x, offset);
932 tri[2].y = _simd16_add_ps(tri[2].y, offset);
933
934 simd16scalari vXi[3], vYi[3];
935
936 // Set vXi, vYi to required fixed point precision
937 FPToFixedPoint(tri, vXi, vYi);
938
939 // triangle setup
940 simd16scalari vAi[3], vBi[3];
941 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
942
943 // determinant
944 simd16scalari vDet[2];
945 calcDeterminantIntVertical(vAi, vBi, vDet);
946
947 // cull zero area
948 uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
949 uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));
950
951 uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
952
953 // don't cull degenerate triangles if we're conservatively rasterizing
954 uint32_t origTriMask = triMask;
955 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
956 {
957 triMask &= ~cullZeroAreaMask;
958 }
959
960 // determine front winding tris
961 // CW +det
962 // CCW det < 0;
963 // 0 area triangles are marked as backfacing regardless of winding order,
964 // which is required behavior for conservative rast and wireframe rendering
965 uint32_t frontWindingTris;
966 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
967 {
968 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
969 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
970 }
971 else
972 {
973 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
974 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
975 }
976 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
977
978 // cull
979 uint32_t cullTris;
980 switch ((SWR_CULLMODE)rastState.cullMode)
981 {
982 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
983 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
984 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
985 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
986 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
987 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
988 }
989
990 triMask &= ~cullTris;
991
992 if (origTriMask ^ triMask)
993 {
994 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
995 }
996
997 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
998 // compute per tri backface
999 uint32_t frontFaceMask = frontWindingTris;
1000 uint32_t *pPrimID = (uint32_t *)&primID;
1001 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1002 DWORD triIndex = 0;
1003
1004 uint32_t edgeEnable;
1005 PFN_WORK_FUNC pfnWork;
1006 if (CT::IsConservativeT::value)
1007 {
1008 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1009 // used to call the appropriate templated rasterizer function
1010 if (cullZeroAreaMask > 0)
1011 {
1012 // e0 = v1-v0
1013 const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
1014 const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);
1015
1016 uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));
1017
1018 // e1 = v2-v1
1019 const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
1020 const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);
1021
1022 uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));
1023
1024 // e2 = v0-v2
1025 // if v0 == v1 & v1 == v2, v0 == v2
1026 uint32_t e2Mask = e0Mask & e1Mask;
1027 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1028
1029 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1030 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1031 e0Mask = pdep_u32(e0Mask, 0x00249249);
1032
1033 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1034 e1Mask = pdep_u32(e1Mask, 0x00492492);
1035
1036 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1037 e2Mask = pdep_u32(e2Mask, 0x00924924);
1038
1039 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1040 }
1041 else
1042 {
1043 edgeEnable = 0x00FFFFFF;
1044 }
1045 }
1046 else
1047 {
1048 // degenerate triangles won't be sent to rasterizer; just enable all edges
1049 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1050 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
1051 }
1052
1053 if (!triMask)
1054 {
1055 goto endBinTriangles;
1056 }
1057
1058 // Calc bounding box of triangles
1059 simd16BBox bbox;
1060 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1061
1062 // determine if triangle falls between pixel centers and discard
1063 // only discard for non-MSAA case and when conservative rast is disabled
1064 // (xmin + 127) & ~255
1065 // (xmax + 128) & ~255
1066 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
1067 (!CT::IsConservativeT::value))
1068 {
1069 origTriMask = triMask;
1070
1071 int cullCenterMask;
1072
1073 {
1074 simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
1075 xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
1076 simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
1077 xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));
1078
1079 simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);
1080
1081 simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
1082 ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
1083 simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
1084 ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));
1085
1086 simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);
1087
1088 vMaskV = _simd16_or_si(vMaskH, vMaskV);
1089 cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
1090 }
1091
1092 triMask &= ~cullCenterMask;
1093
1094 if (origTriMask ^ triMask)
1095 {
1096 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1097 }
1098 }
1099
1100 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1101 // Gather the AOS effective scissor rects based on the per-prim VP index.
1102 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1103 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
1104
1105 if (state.gsState.emitsViewportArrayIndex)
1106 {
1107 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1108 scisXmin, scisYmin, scisXmax, scisYmax);
1109 }
1110 else // broadcast fast path for non-VPAI case.
1111 {
1112 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1113 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1114 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1115 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1116 }
1117
1118 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
1119 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
1120 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
1121 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
1122
1123 if (CT::IsConservativeT::value)
1124 {
1125 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1126 // some area. Bump the xmax/ymax edges out
1127 simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
1128 bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);
1129 simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
1130 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
1131 }
1132
1133 // Cull tris completely outside scissor
1134 {
1135 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
1136 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
1137 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
1138 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
1139 triMask = triMask & ~maskOutsideScissor;
1140 }
1141
1142 endBinTriangles:
1143
1144 // Send surviving triangles to the line or point binner based on fill mode
1145 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
1146 {
1147 // Simple non-conformant wireframe mode, useful for debugging
1148 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1149 simd16vector line[2];
1150 simd16scalar recipW[2];
1151 line[0] = tri[0];
1152 line[1] = tri[1];
1153 recipW[0] = vRecipW0;
1154 recipW[1] = vRecipW1;
1155 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1156
1157 line[0] = tri[1];
1158 line[1] = tri[2];
1159 recipW[0] = vRecipW1;
1160 recipW[1] = vRecipW2;
1161 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1162
1163 line[0] = tri[2];
1164 line[1] = tri[0];
1165 recipW[0] = vRecipW2;
1166 recipW[1] = vRecipW0;
1167 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1168
1169 AR_END(FEBinTriangles, 1);
1170 return;
1171 }
1172 else if (rastState.fillMode == SWR_FILLMODE_POINT)
1173 {
1174 // Bin 3 points
1175 BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
1176 BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
1177 BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
1178 return;
1179 }
1180
1181 // Convert triangle bbox to macrotile units.
1182 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1183 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1184 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1185 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1186
1187 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
1188
1189 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
1190 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
1191 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
1192 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
1193
1194 // transpose verts needed for backend
1195 /// @todo modify BE to take non-transformed verts
1196 __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1197 __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1198 __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1199 __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1200
1201 vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
1202 vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
1203 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
1204 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));
1205
1206 vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
1207 vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
1208 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
1209 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));
1210
1211 // store render target array index
1212 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1213 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1214 {
1215 simd16vector vRtai[3];
1216 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
1217 simd16scalari vRtaii;
1218 vRtaii = _simd16_castps_si(vRtai[0].x);
1219 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1220 }
1221 else
1222 {
1223 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1224 }
1225
1226
1227 // scan remaining valid triangles and bin each separately
1228 while (_BitScanForward(&triIndex, triMask))
1229 {
1230 uint32_t linkageCount = state.backendState.numAttributes;
1231 uint32_t numScalarAttribs = linkageCount * 4;
1232
1233 BE_WORK work;
1234 work.type = DRAW;
1235
1236 bool isDegenerate;
1237 if (CT::IsConservativeT::value)
1238 {
1239 // only rasterize valid edges if we have a degenerate primitive
1240 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1241 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1242 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1243
1244 // Degenerate triangles are required to be constant interpolated
1245 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1246 }
1247 else
1248 {
1249 isDegenerate = false;
1250 work.pfnWork = pfnWork;
1251 }
1252
1253 // Select attribute processor
1254 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1255 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1256
1257 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1258
1259 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1260 desc.triFlags.primID = pPrimID[triIndex];
1261 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1262 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1263
1264 auto pArena = pDC->pArena;
1265 SWR_ASSERT(pArena != nullptr);
1266
1267 // store active attribs
1268 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1269 desc.pAttribs = pAttribs;
1270 desc.numAttribs = linkageCount;
1271 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1272
1273 // store triangle vertex data
1274 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1275
1276 {
1277 const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
1278 const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH
1279
1280 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
1281 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
1282 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
1283 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
1284 }
1285
1286 // store user clip distances
1287 if (rastState.clipDistanceMask)
1288 {
1289 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1290 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1291 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1292 }
1293
1294 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1295 {
1296 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1297 {
1298 #if KNOB_ENABLE_TOSS_POINTS
1299 if (!KNOB_TOSS_SETUP_TRIS)
1300 #endif
1301 {
1302 pTileMgr->enqueue(x, y, &work);
1303 }
1304 }
1305 }
1306
1307 triMask &= ~(1 << triIndex);
1308 }
1309
1310 AR_END(FEBinTriangles, 1);
1311 }
1312
1313 #endif
1314 struct FEBinTrianglesChooser
1315 {
1316 typedef PFN_PROCESS_PRIMS FuncType;
1317
1318 template <typename... ArgsB>
1319 static FuncType GetFunc()
1320 {
1321 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1322 }
1323 };
1324
1325 // Selector for correct templated BinTrinagles function
1326 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1327 {
1328 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1329 }
1330
1331 #if USE_SIMD16_FRONTEND
1332 struct FEBinTrianglesChooser_simd16
1333 {
1334 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1335
1336 template <typename... ArgsB>
1337 static FuncType GetFunc()
1338 {
1339 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1340 }
1341 };
1342
1343 // Selector for correct templated BinTrinagles function
1344 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1345 {
1346 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1347 }
1348
1349 #endif
1350
1351 void BinPostSetupPoints(
1352 DRAW_CONTEXT *pDC,
1353 PA_STATE& pa,
1354 uint32_t workerId,
1355 simdvector prim[],
1356 uint32_t primMask,
1357 simdscalari primID,
1358 simdscalari viewportIdx)
1359 {
1360 SWR_CONTEXT *pContext = pDC->pContext;
1361
1362 AR_BEGIN(FEBinPoints, pDC->drawId);
1363
1364 simdvector& primVerts = prim[0];
1365
1366 const API_STATE& state = GetApiState(pDC);
1367 const SWR_GS_STATE& gsState = state.gsState;
1368 const SWR_RASTSTATE& rastState = state.rastState;
1369 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1370
1371 // Select attribute processor
1372 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1373 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1374
1375 // convert to fixed point
1376 simdscalari vXi, vYi;
1377 vXi = fpToFixedPointVertical(primVerts.x);
1378 vYi = fpToFixedPointVertical(primVerts.y);
1379
1380 if (CanUseSimplePoints(pDC))
1381 {
1382 // adjust for ymin-xmin rule
1383 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
1384 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
1385
1386 // cull points off the ymin-xmin edge of the viewport
1387 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
1388 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
1389
1390 // compute macro tile coordinates
1391 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1392 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1393
1394 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
1395 _simd_store_si((simdscalari*)aMacroX, macroX);
1396 _simd_store_si((simdscalari*)aMacroY, macroY);
1397
1398 // compute raster tile coordinates
1399 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1400 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1401
1402 // compute raster tile relative x,y for coverage mask
1403 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1404 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1405
1406 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1407 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1408
1409 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
1410 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
1411 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
1412 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
1413
1414 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
1415 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
1416 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
1417 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
1418
1419 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
1420 _simd_store_ps((float*)aZ, primVerts.z);
1421
1422 // store render target array index
1423 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1424 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1425 {
1426 simdvector vRtai;
1427 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
1428 simdscalari vRtaii = _simd_castps_si(vRtai.x);
1429 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1430 }
1431 else
1432 {
1433 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1434 }
1435
1436 uint32_t *pPrimID = (uint32_t *)&primID;
1437 DWORD primIndex = 0;
1438
1439 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1440
1441 // scan remaining valid triangles and bin each separately
1442 while (_BitScanForward(&primIndex, primMask))
1443 {
1444 uint32_t linkageCount = backendState.numAttributes;
1445 uint32_t numScalarAttribs = linkageCount * 4;
1446
1447 BE_WORK work;
1448 work.type = DRAW;
1449
1450 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1451
1452 // points are always front facing
1453 desc.triFlags.frontFacing = 1;
1454 desc.triFlags.primID = pPrimID[primIndex];
1455 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1456 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1457
1458 work.pfnWork = RasterizeSimplePoint;
1459
1460 auto pArena = pDC->pArena;
1461 SWR_ASSERT(pArena != nullptr);
1462
1463 // store attributes
1464 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1465 desc.pAttribs = pAttribs;
1466 desc.numAttribs = linkageCount;
1467
1468 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1469
1470 // store raster tile aligned x, y, perspective correct z
1471 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1472 desc.pTriBuffer = pTriBuffer;
1473 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1474 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1475 *pTriBuffer = aZ[primIndex];
1476
1477 uint32_t tX = aTileRelativeX[primIndex];
1478 uint32_t tY = aTileRelativeY[primIndex];
1479
1480 // pack the relative x,y into the coverageMask, the rasterizer will
1481 // generate the true coverage mask from it
1482 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1483
1484 // bin it
1485 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1486 #if KNOB_ENABLE_TOSS_POINTS
1487 if (!KNOB_TOSS_SETUP_TRIS)
1488 #endif
1489 {
1490 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1491 }
1492 primMask &= ~(1 << primIndex);
1493 }
1494 }
1495 else
1496 {
1497 // non simple points need to be potentially binned to multiple macro tiles
1498 simdscalar vPointSize;
1499 if (rastState.pointParam)
1500 {
1501 simdvector size[3];
1502 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
1503 vPointSize = size[0].x;
1504 }
1505 else
1506 {
1507 vPointSize = _simd_set1_ps(rastState.pointSize);
1508 }
1509
1510 // bloat point to bbox
1511 simdBBox bbox;
1512 bbox.xmin = bbox.xmax = vXi;
1513 bbox.ymin = bbox.ymax = vYi;
1514
1515 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
1516 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1517 bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1518 bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1519 bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1520 bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1521
1522 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1523 // Gather the AOS effective scissor rects based on the per-prim VP index.
1524 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1525 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1526 if (state.gsState.emitsViewportArrayIndex)
1527 {
1528 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1529 scisXmin, scisYmin, scisXmax, scisYmax);
1530 }
1531 else // broadcast fast path for non-VPAI case.
1532 {
1533 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1534 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1535 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1536 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1537 }
1538
1539 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1540 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1541 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1542 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1543
1544 // Cull bloated points completely outside scissor
1545 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1546 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1547 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1548 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1549 primMask = primMask & ~maskOutsideScissor;
1550
1551 // Convert bbox to macrotile units.
1552 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1553 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1554 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1555 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1556
1557 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1558 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1559 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1560 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1561 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1562
1563 // store render target array index
1564 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1565 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1566 {
1567 simdvector vRtai[2];
1568 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1569 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1570 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1571 }
1572 else
1573 {
1574 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1575 }
1576
1577 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
1578 _simd_store_ps((float*)aPointSize, vPointSize);
1579
1580 uint32_t *pPrimID = (uint32_t *)&primID;
1581
1582 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
1583 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
1584 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
1585
1586 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
1587 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
1588 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
1589
1590 // scan remaining valid prims and bin each separately
1591 const SWR_BACKEND_STATE& backendState = state.backendState;
1592 DWORD primIndex;
1593 while (_BitScanForward(&primIndex, primMask))
1594 {
1595 uint32_t linkageCount = backendState.numAttributes;
1596 uint32_t numScalarAttribs = linkageCount * 4;
1597
1598 BE_WORK work;
1599 work.type = DRAW;
1600
1601 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1602
1603 desc.triFlags.frontFacing = 1;
1604 desc.triFlags.primID = pPrimID[primIndex];
1605 desc.triFlags.pointSize = aPointSize[primIndex];
1606 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1607 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1608
1609 work.pfnWork = RasterizeTriPoint;
1610
1611 auto pArena = pDC->pArena;
1612 SWR_ASSERT(pArena != nullptr);
1613
1614 // store active attribs
1615 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1616 desc.numAttribs = linkageCount;
1617 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1618
1619 // store point vertex data
1620 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1621 desc.pTriBuffer = pTriBuffer;
1622 *pTriBuffer++ = aPrimVertsX[primIndex];
1623 *pTriBuffer++ = aPrimVertsY[primIndex];
1624 *pTriBuffer = aPrimVertsZ[primIndex];
1625
1626 // store user clip distances
1627 if (rastState.clipDistanceMask)
1628 {
1629 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1630 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1631 float dists[8];
1632 float one = 1.0f;
1633 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
1634 for (uint32_t i = 0; i < numClipDist; i++) {
1635 desc.pUserClipBuffer[3*i + 0] = 0.0f;
1636 desc.pUserClipBuffer[3*i + 1] = 0.0f;
1637 desc.pUserClipBuffer[3*i + 2] = dists[i];
1638 }
1639 }
1640
1641 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1642 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1643 {
1644 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1645 {
1646 #if KNOB_ENABLE_TOSS_POINTS
1647 if (!KNOB_TOSS_SETUP_TRIS)
1648 #endif
1649 {
1650 pTileMgr->enqueue(x, y, &work);
1651 }
1652 }
1653 }
1654
1655 primMask &= ~(1 << primIndex);
1656 }
1657 }
1658
1659 AR_END(FEBinPoints, 1);
1660 }
1661
1662 //////////////////////////////////////////////////////////////////////////
1663 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1664 /// @param pDC - pointer to draw context.
1665 /// @param pa - The primitive assembly object.
1666 /// @param workerId - thread's worker id. Even thread has a unique id.
1667 /// @param tri - Contains point position data for SIMDs worth of points.
1668 /// @param primID - Primitive ID for each point.
1669 void BinPoints(
1670 DRAW_CONTEXT *pDC,
1671 PA_STATE& pa,
1672 uint32_t workerId,
1673 simdvector prim[3],
1674 uint32_t primMask,
1675 simdscalari primID,
1676 simdscalari viewportIdx)
1677 {
1678 simdvector& primVerts = prim[0];
1679
1680 const API_STATE& state = GetApiState(pDC);
1681 const SWR_FRONTEND_STATE& feState = state.frontendState;
1682 const SWR_RASTSTATE& rastState = state.rastState;
1683
1684 if (!feState.vpTransformDisable)
1685 {
1686 // perspective divide
1687 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
1688 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
1689 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
1690 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
1691
1692 // viewport transform to screen coords
1693 if (state.gsState.emitsViewportArrayIndex)
1694 {
1695 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
1696 }
1697 else
1698 {
1699 viewportTransform<1>(&primVerts, state.vpMatrices);
1700 }
1701 }
1702
1703 // adjust for pixel center location
1704 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1705 primVerts.x = _simd_add_ps(primVerts.x, offset);
1706 primVerts.y = _simd_add_ps(primVerts.y, offset);
1707
1708 BinPostSetupPoints(
1709 pDC,
1710 pa,
1711 workerId,
1712 prim,
1713 primMask,
1714 primID,
1715 viewportIdx);
1716 }
1717
1718 #if USE_SIMD16_FRONTEND
1719 void BinPostSetupPoints_simd16(
1720 DRAW_CONTEXT *pDC,
1721 PA_STATE& pa,
1722 uint32_t workerId,
1723 simd16vector prim[],
1724 uint32_t primMask,
1725 simd16scalari primID,
1726 simd16scalari viewportIdx)
1727 {
1728 SWR_CONTEXT *pContext = pDC->pContext;
1729
1730 AR_BEGIN(FEBinPoints, pDC->drawId);
1731
1732 simd16vector& primVerts = prim[0];
1733
1734 const API_STATE& state = GetApiState(pDC);
1735 const SWR_GS_STATE& gsState = state.gsState;
1736 const SWR_RASTSTATE& rastState = state.rastState;
1737 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1738
1739 // Select attribute processor
1740 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1741 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1742
1743 // convert to fixed point
1744 simd16scalari vXi, vYi;
1745
1746 vXi = fpToFixedPointVertical(primVerts.x);
1747 vYi = fpToFixedPointVertical(primVerts.y);
1748
1749 if (CanUseSimplePoints(pDC))
1750 {
1751 // adjust for ymin-xmin rule
1752 vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1));
1753 vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1));
1754
1755 // cull points off the ymin-xmin edge of the viewport
1756 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi));
1757 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi));
1758
1759 // compute macro tile coordinates
1760 simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1761 simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1762
1763 OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH];
1764
1765 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroX), macroX);
1766 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroY), macroY);
1767
1768 // compute raster tile coordinates
1769 simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1770 simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1771
1772 // compute raster tile relative x,y for coverage mask
1773 simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1774 simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1775
1776 simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1777 simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1778
1779 OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH];
1780 OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH];
1781
1782 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeX), tileRelativeX);
1783 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeY), tileRelativeY);
1784
1785 OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH];
1786 OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH];
1787
1788 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedX), tileAlignedX);
1789 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedY), tileAlignedY);
1790
1791 OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH];
1792 _simd16_store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1793
1794 // store render target array index
1795 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1796 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1797 {
1798 simd16vector vRtai;
1799 pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai);
1800 simd16scalari vRtaii = _simd16_castps_si(vRtai.x);
1801 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1802 }
1803 else
1804 {
1805 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1806 }
1807
1808 uint32_t *pPrimID = (uint32_t *)&primID;
1809 DWORD primIndex = 0;
1810
1811 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1812
1813 // scan remaining valid triangles and bin each separately
1814 while (_BitScanForward(&primIndex, primMask))
1815 {
1816 uint32_t linkageCount = backendState.numAttributes;
1817 uint32_t numScalarAttribs = linkageCount * 4;
1818
1819 BE_WORK work;
1820 work.type = DRAW;
1821
1822 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1823
1824 // points are always front facing
1825 desc.triFlags.frontFacing = 1;
1826 desc.triFlags.primID = pPrimID[primIndex];
1827 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1828 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1829
1830 work.pfnWork = RasterizeSimplePoint;
1831
1832 auto pArena = pDC->pArena;
1833 SWR_ASSERT(pArena != nullptr);
1834
1835 // store attributes
1836 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1837 desc.pAttribs = pAttribs;
1838 desc.numAttribs = linkageCount;
1839
1840 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1841
1842 // store raster tile aligned x, y, perspective correct z
1843 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1844 desc.pTriBuffer = pTriBuffer;
1845 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1846 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1847 *pTriBuffer = aZ[primIndex];
1848
1849 uint32_t tX = aTileRelativeX[primIndex];
1850 uint32_t tY = aTileRelativeY[primIndex];
1851
1852 // pack the relative x,y into the coverageMask, the rasterizer will
1853 // generate the true coverage mask from it
1854 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1855
1856 // bin it
1857 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1858 #if KNOB_ENABLE_TOSS_POINTS
1859 if (!KNOB_TOSS_SETUP_TRIS)
1860 #endif
1861 {
1862 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1863 }
1864
1865 primMask &= ~(1 << primIndex);
1866 }
1867 }
1868 else
1869 {
1870 // non simple points need to be potentially binned to multiple macro tiles
1871 simd16scalar vPointSize;
1872
1873 if (rastState.pointParam)
1874 {
1875 simd16vector size[3];
1876 pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size);
1877 vPointSize = size[0].x;
1878 }
1879 else
1880 {
1881 vPointSize = _simd16_set1_ps(rastState.pointSize);
1882 }
1883
1884 // bloat point to bbox
1885 simd16BBox bbox;
1886
1887 bbox.xmin = bbox.xmax = vXi;
1888 bbox.ymin = bbox.ymax = vYi;
1889
1890 simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f));
1891 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1892
1893 bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
1894 bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
1895 bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
1896 bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
1897
1898 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1899 // Gather the AOS effective scissor rects based on the per-prim VP index.
1900 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1901 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
1902 if (state.gsState.emitsViewportArrayIndex)
1903 {
1904 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1905 scisXmin, scisYmin, scisXmax, scisYmax);
1906 }
1907 else // broadcast fast path for non-VPAI case.
1908 {
1909 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1910 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1911 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1912 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1913 }
1914
1915 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
1916 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
1917 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
1918 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
1919
1920 // Cull bloated points completely outside scissor
1921 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
1922 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
1923 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
1924 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
1925 primMask = primMask & ~maskOutsideScissor;
1926
1927 // Convert bbox to macrotile units.
1928 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1929 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1930 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1931 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1932
1933 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
1934
1935 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
1936 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
1937 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
1938 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
1939
1940 // store render target array index
1941 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1942 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1943 {
1944 simd16vector vRtai[2];
1945 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
1946 simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
1947 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1948 }
1949 else
1950 {
1951 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1952 }
1953
1954 OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH];
1955 _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1956
1957 uint32_t *pPrimID = (uint32_t *)&primID;
1958
1959 OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH];
1960 OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH];
1961 OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH];
1962
1963 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1964 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1965 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1966
1967 // scan remaining valid prims and bin each separately
1968 const SWR_BACKEND_STATE& backendState = state.backendState;
1969 DWORD primIndex;
1970 while (_BitScanForward(&primIndex, primMask))
1971 {
1972 uint32_t linkageCount = backendState.numAttributes;
1973 uint32_t numScalarAttribs = linkageCount * 4;
1974
1975 BE_WORK work;
1976 work.type = DRAW;
1977
1978 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1979
1980 desc.triFlags.frontFacing = 1;
1981 desc.triFlags.primID = pPrimID[primIndex];
1982 desc.triFlags.pointSize = aPointSize[primIndex];
1983 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1984 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1985
1986 work.pfnWork = RasterizeTriPoint;
1987
1988 auto pArena = pDC->pArena;
1989 SWR_ASSERT(pArena != nullptr);
1990
1991 // store active attribs
1992 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1993 desc.numAttribs = linkageCount;
1994 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1995
1996 // store point vertex data
1997 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1998 desc.pTriBuffer = pTriBuffer;
1999 *pTriBuffer++ = aPrimVertsX[primIndex];
2000 *pTriBuffer++ = aPrimVertsY[primIndex];
2001 *pTriBuffer = aPrimVertsZ[primIndex];
2002
2003 // store user clip distances
2004 if (rastState.clipDistanceMask)
2005 {
2006 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2007 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2008 float dists[8];
2009 float one = 1.0f;
2010 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
2011 for (uint32_t i = 0; i < numClipDist; i++) {
2012 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
2013 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
2014 desc.pUserClipBuffer[3 * i + 2] = dists[i];
2015 }
2016 }
2017
2018 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2019 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2020 {
2021 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2022 {
2023 #if KNOB_ENABLE_TOSS_POINTS
2024 if (!KNOB_TOSS_SETUP_TRIS)
2025 #endif
2026 {
2027 pTileMgr->enqueue(x, y, &work);
2028 }
2029 }
2030 }
2031
2032 primMask &= ~(1 << primIndex);
2033 }
2034 }
2035
2036 AR_END(FEBinPoints, 1);
2037 }
2038
2039 void SIMDAPI BinPoints_simd16(
2040 DRAW_CONTEXT *pDC,
2041 PA_STATE& pa,
2042 uint32_t workerId,
2043 simd16vector prim[3],
2044 uint32_t primMask,
2045 simd16scalari primID,
2046 simd16scalari viewportIdx)
2047 {
2048 simd16vector& primVerts = prim[0];
2049
2050 const API_STATE& state = GetApiState(pDC);
2051 const SWR_FRONTEND_STATE& feState = state.frontendState;
2052 const SWR_RASTSTATE& rastState = state.rastState;
2053
2054 if (!feState.vpTransformDisable)
2055 {
2056 // perspective divide
2057 simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
2058
2059 primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
2060 primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
2061 primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
2062
2063 // viewport transform to screen coords
2064 if (state.gsState.emitsViewportArrayIndex)
2065 {
2066 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
2067 }
2068 else
2069 {
2070 viewportTransform<1>(&primVerts, state.vpMatrices);
2071 }
2072 }
2073
2074 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2075
2076 primVerts.x = _simd16_add_ps(primVerts.x, offset);
2077 primVerts.y = _simd16_add_ps(primVerts.y, offset);
2078
2079 BinPostSetupPoints_simd16(
2080 pDC,
2081 pa,
2082 workerId,
2083 prim,
2084 primMask,
2085 primID,
2086 viewportIdx);
2087 }
2088
2089 #endif
2090 //////////////////////////////////////////////////////////////////////////
2091 /// @brief Bin SIMD lines to the backend.
2092 /// @param pDC - pointer to draw context.
2093 /// @param pa - The primitive assembly object.
2094 /// @param workerId - thread's worker id. Even thread has a unique id.
2095 /// @param tri - Contains line position data for SIMDs worth of points.
2096 /// @param primID - Primitive ID for each line.
2097 /// @param viewportIdx - Viewport Array Index for each line.
2098 void BinPostSetupLines(
2099 DRAW_CONTEXT *pDC,
2100 PA_STATE& pa,
2101 uint32_t workerId,
2102 simdvector prim[],
2103 simdscalar recipW[],
2104 uint32_t primMask,
2105 simdscalari primID,
2106 simdscalari viewportIdx)
2107 {
2108 SWR_CONTEXT *pContext = pDC->pContext;
2109
2110 AR_BEGIN(FEBinLines, pDC->drawId);
2111
2112 const API_STATE& state = GetApiState(pDC);
2113 const SWR_RASTSTATE& rastState = state.rastState;
2114 const SWR_GS_STATE& gsState = state.gsState;
2115
2116 // Select attribute processor
2117 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2118 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2119
2120 simdscalar& vRecipW0 = recipW[0];
2121 simdscalar& vRecipW1 = recipW[1];
2122
2123 // convert to fixed point
2124 simdscalari vXi[2], vYi[2];
2125 vXi[0] = fpToFixedPointVertical(prim[0].x);
2126 vYi[0] = fpToFixedPointVertical(prim[0].y);
2127 vXi[1] = fpToFixedPointVertical(prim[1].x);
2128 vYi[1] = fpToFixedPointVertical(prim[1].y);
2129
2130 // compute x-major vs y-major mask
2131 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2132 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2133 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2134 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2135
2136 // cull zero-length lines
2137 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2138 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2139
2140 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2141
2142 uint32_t *pPrimID = (uint32_t *)&primID;
2143 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2144
2145 simdscalar vUnused = _simd_setzero_ps();
2146
2147 // Calc bounding box of lines
2148 simdBBox bbox;
2149 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2150 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2151 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2152 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2153
2154 // bloat bbox by line width along minor axis
2155 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2156 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2157 simdBBox bloatBox;
2158 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2159 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2160 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2161 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2162
2163 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2164 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2165 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2166 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2167
2168 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2169 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2170 if (state.gsState.emitsViewportArrayIndex)
2171 {
2172 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2173 scisXmin, scisYmin, scisXmax, scisYmax);
2174 }
2175 else // broadcast fast path for non-VPAI case.
2176 {
2177 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2178 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2179 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2180 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2181 }
2182
2183 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2184 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2185 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2186 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2187
2188 // Cull prims completely outside scissor
2189 {
2190 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2191 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2192 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2193 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2194 primMask = primMask & ~maskOutsideScissor;
2195 }
2196
2197 if (!primMask)
2198 {
2199 goto endBinLines;
2200 }
2201
2202 // Convert triangle bbox to macrotile units.
2203 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2204 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2205 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2206 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2207
2208 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2209 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2210 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2211 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2212 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2213
2214 // transpose verts needed for backend
2215 /// @todo modify BE to take non-transformed verts
2216 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2217 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2218 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2219 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2220 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2221
2222 // store render target array index
2223 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2224 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2225 {
2226 simdvector vRtai[2];
2227 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2228 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2229 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2230 }
2231 else
2232 {
2233 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2234 }
2235
2236 // scan remaining valid prims and bin each separately
2237 DWORD primIndex;
2238 while (_BitScanForward(&primIndex, primMask))
2239 {
2240 uint32_t linkageCount = state.backendState.numAttributes;
2241 uint32_t numScalarAttribs = linkageCount * 4;
2242
2243 BE_WORK work;
2244 work.type = DRAW;
2245
2246 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2247
2248 desc.triFlags.frontFacing = 1;
2249 desc.triFlags.primID = pPrimID[primIndex];
2250 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2251 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2252 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2253
2254 work.pfnWork = RasterizeLine;
2255
2256 auto pArena = pDC->pArena;
2257 SWR_ASSERT(pArena != nullptr);
2258
2259 // store active attribs
2260 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2261 desc.numAttribs = linkageCount;
2262 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2263
2264 // store line vertex data
2265 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2266 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2267 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2268 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2269 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2270
2271 // store user clip distances
2272 if (rastState.clipDistanceMask)
2273 {
2274 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2275 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2276 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2277 }
2278
2279 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2280 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2281 {
2282 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2283 {
2284 #if KNOB_ENABLE_TOSS_POINTS
2285 if (!KNOB_TOSS_SETUP_TRIS)
2286 #endif
2287 {
2288 pTileMgr->enqueue(x, y, &work);
2289 }
2290 }
2291 }
2292
2293 primMask &= ~(1 << primIndex);
2294 }
2295
2296 endBinLines:
2297
2298 AR_END(FEBinLines, 1);
2299 }
2300
2301 #if USE_SIMD16_FRONTEND
2302 void BinPostSetupLines_simd16(
2303 DRAW_CONTEXT *pDC,
2304 PA_STATE& pa,
2305 uint32_t workerId,
2306 simd16vector prim[],
2307 simd16scalar recipW[],
2308 uint32_t primMask,
2309 simd16scalari primID,
2310 simd16scalari viewportIdx)
2311 {
2312 SWR_CONTEXT *pContext = pDC->pContext;
2313
2314 AR_BEGIN(FEBinLines, pDC->drawId);
2315
2316 const API_STATE& state = GetApiState(pDC);
2317 const SWR_RASTSTATE& rastState = state.rastState;
2318 const SWR_GS_STATE& gsState = state.gsState;
2319
2320 // Select attribute processor
2321 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2322 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2323
2324 simd16scalar& vRecipW0 = recipW[0];
2325 simd16scalar& vRecipW1 = recipW[1];
2326
2327 // convert to fixed point
2328 simd16scalari vXi[2], vYi[2];
2329
2330 vXi[0] = fpToFixedPointVertical(prim[0].x);
2331 vYi[0] = fpToFixedPointVertical(prim[0].y);
2332 vXi[1] = fpToFixedPointVertical(prim[1].x);
2333 vYi[1] = fpToFixedPointVertical(prim[1].y);
2334
2335 // compute x-major vs y-major mask
2336 simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1]));
2337 simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1]));
2338 simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength));
2339 uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask);
2340
2341 // cull zero-length lines
2342 simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si());
2343 vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si()));
2344
2345 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask));
2346
2347 uint32_t *pPrimID = (uint32_t *)&primID;
2348 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2349
2350 // Calc bounding box of lines
2351 simd16BBox bbox;
2352 bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]);
2353 bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]);
2354 bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]);
2355 bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]);
2356
2357 // bloat bbox by line width along minor axis
2358 simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f);
2359 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2360
2361 simd16BBox bloatBox;
2362
2363 bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
2364 bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
2365 bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
2366 bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
2367
2368 bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2369 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2370 bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2371 bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2372
2373 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2374 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
2375
2376 if (state.gsState.emitsViewportArrayIndex)
2377 {
2378 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2379 scisXmin, scisYmin, scisXmax, scisYmax);
2380 }
2381 else // broadcast fast path for non-VPAI case.
2382 {
2383 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2384 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2385 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2386 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2387 }
2388
2389 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
2390 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
2391 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
2392 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
2393
2394 // Cull prims completely outside scissor
2395 {
2396 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
2397 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
2398 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
2399 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
2400 primMask = primMask & ~maskOutsideScissor;
2401 }
2402
2403 const simdscalar unused = _simd_setzero_ps();
2404
2405 if (!primMask)
2406 {
2407 goto endBinLines;
2408 }
2409
2410 // Convert triangle bbox to macrotile units.
2411 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2412 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2413 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2414 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2415
2416 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
2417
2418 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
2419 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
2420 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
2421 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
2422
2423 // transpose verts needed for backend
2424 /// @todo modify BE to take non-transformed verts
2425 __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2426 __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2427 __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2428 __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2429
2430 vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
2431 vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
2432 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused);
2433 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused);
2434
2435 vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused);
2436 vTranspose3x8(vHorizY[1], _simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused);
2437 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused);
2438 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused);
2439
2440 // store render target array index
2441 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
2442 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2443 {
2444 simd16vector vRtai[2];
2445 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
2446 simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
2447 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
2448 }
2449 else
2450 {
2451 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
2452 }
2453
2454 // scan remaining valid prims and bin each separately
2455 DWORD primIndex;
2456 while (_BitScanForward(&primIndex, primMask))
2457 {
2458 uint32_t linkageCount = state.backendState.numAttributes;
2459 uint32_t numScalarAttribs = linkageCount * 4;
2460
2461 BE_WORK work;
2462 work.type = DRAW;
2463
2464 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2465
2466 desc.triFlags.frontFacing = 1;
2467 desc.triFlags.primID = pPrimID[primIndex];
2468 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2469 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2470 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2471
2472 work.pfnWork = RasterizeLine;
2473
2474 auto pArena = pDC->pArena;
2475 SWR_ASSERT(pArena != nullptr);
2476
2477 // store active attribs
2478 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2479 desc.numAttribs = linkageCount;
2480 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2481
2482 // store line vertex data
2483 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2484
2485 {
2486 const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
2487 const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH
2488
2489 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
2490 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
2491 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
2492 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
2493 }
2494
2495 // store user clip distances
2496 if (rastState.clipDistanceMask)
2497 {
2498 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2499 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2500 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2501 }
2502
2503 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2504 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2505 {
2506 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2507 {
2508 #if KNOB_ENABLE_TOSS_POINTS
2509 if (!KNOB_TOSS_SETUP_TRIS)
2510 #endif
2511 {
2512 pTileMgr->enqueue(x, y, &work);
2513 }
2514 }
2515 }
2516
2517 primMask &= ~(1 << primIndex);
2518 }
2519
2520 endBinLines:
2521
2522 AR_END(FEBinLines, 1);
2523 }
2524
2525 #endif
2526 //////////////////////////////////////////////////////////////////////////
2527 /// @brief Bin SIMD lines to the backend.
2528 /// @param pDC - pointer to draw context.
2529 /// @param pa - The primitive assembly object.
2530 /// @param workerId - thread's worker id. Even thread has a unique id.
2531 /// @param tri - Contains line position data for SIMDs worth of points.
2532 /// @param primID - Primitive ID for each line.
2533 /// @param viewportIdx - Viewport Array Index for each line.
2534 void BinLines(
2535 DRAW_CONTEXT *pDC,
2536 PA_STATE& pa,
2537 uint32_t workerId,
2538 simdvector prim[],
2539 uint32_t primMask,
2540 simdscalari primID,
2541 simdscalari viewportIdx)
2542 {
2543 const API_STATE& state = GetApiState(pDC);
2544 const SWR_RASTSTATE& rastState = state.rastState;
2545 const SWR_FRONTEND_STATE& feState = state.frontendState;
2546
2547 simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
2548
2549 if (!feState.vpTransformDisable)
2550 {
2551 // perspective divide
2552 vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2553 vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2554
2555 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
2556 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
2557
2558 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
2559 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
2560
2561 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
2562 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
2563
2564 // viewport transform to screen coords
2565 if (state.gsState.emitsViewportArrayIndex)
2566 {
2567 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2568 }
2569 else
2570 {
2571 viewportTransform<2>(prim, state.vpMatrices);
2572 }
2573 }
2574
2575 // adjust for pixel center location
2576 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2577 prim[0].x = _simd_add_ps(prim[0].x, offset);
2578 prim[0].y = _simd_add_ps(prim[0].y, offset);
2579
2580 prim[1].x = _simd_add_ps(prim[1].x, offset);
2581 prim[1].y = _simd_add_ps(prim[1].y, offset);
2582
2583 BinPostSetupLines(
2584 pDC,
2585 pa,
2586 workerId,
2587 prim,
2588 vRecipW,
2589 primMask,
2590 primID,
2591 viewportIdx);
2592 }
2593
2594 #if USE_SIMD16_FRONTEND
2595 void SIMDAPI BinLines_simd16(
2596 DRAW_CONTEXT *pDC,
2597 PA_STATE& pa,
2598 uint32_t workerId,
2599 simd16vector prim[3],
2600 uint32_t primMask,
2601 simd16scalari primID,
2602 simd16scalari viewportIdx)
2603 {
2604 const API_STATE& state = GetApiState(pDC);
2605 const SWR_RASTSTATE& rastState = state.rastState;
2606 const SWR_FRONTEND_STATE& feState = state.frontendState;
2607
2608 simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
2609
2610 if (!feState.vpTransformDisable)
2611 {
2612 // perspective divide
2613 vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w);
2614 vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w);
2615
2616 prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]);
2617 prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]);
2618
2619 prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]);
2620 prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]);
2621
2622 prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]);
2623 prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
2624
2625 // viewport transform to screen coords
2626 if (state.gsState.emitsViewportArrayIndex)
2627 {
2628 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2629 }
2630 else
2631 {
2632 viewportTransform<2>(prim, state.vpMatrices);
2633 }
2634 }
2635
2636 // adjust for pixel center location
2637 simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2638
2639 prim[0].x = _simd16_add_ps(prim[0].x, offset);
2640 prim[0].y = _simd16_add_ps(prim[0].y, offset);
2641
2642 prim[1].x = _simd16_add_ps(prim[1].x, offset);
2643 prim[1].y = _simd16_add_ps(prim[1].y, offset);
2644
2645 BinPostSetupLines_simd16(
2646 pDC,
2647 pa,
2648 workerId,
2649 prim,
2650 vRecipW,
2651 primMask,
2652 primID,
2653 viewportIdx);
2654 }
2655
2656 #endif