1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
32 #include "conservativeRast.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
// Forward declarations for the post-setup line/point binners (defined later in
// this file).  The triangle binner falls back to these for wireframe and
// point fill modes.
void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
#if USE_SIMD16_FRONTEND
// simd16 (16-wide) variants of the post-setup binners used by the SIMD16
// front end; same contract as the 8-wide declarations above.
void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
#endif // USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend based on linkage mask and
///        linkage map. Essentially just doing an SOA->AOS conversion and pack.
/// @param pDC - Draw context
/// @param pa - Primitive Assembly state
/// @param linkageMask - Specifies which VS outputs are routed to PS.
/// @param pLinkageMap - maps VS attribute slot to PS slot
/// @param triIndex - Triangle to process attributes for
/// @param pBuffer - Output result
// NOTE(review): this paste has dropped source lines inside this function
// (the parameter list, opening/closing braces, 'switch'/'case'/'break'/'else'
// lines).  Comments below annotate only what is visible; each gap is marked
// and must be restored from the original file before compiling.
template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
INLINE
void ProcessAttributes(
    // [gap: parameter list and opening brace missing from paste]
    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;

    // constant-source values (indexed below by SWR_CONSTANT_SOURCE): 0000, 0001, 1111
    static const float constTable[3][4] = {
        { 0.0f, 0.0f, 0.0f, 0.0f },
        { 0.0f, 0.0f, 0.0f, 1.0f },
        { 1.0f, 1.0f, 1.0f, 1.0f }
    // [gap: closing '};' of constTable missing from paste]

    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
        // map the output attribute slot to the VS output slot, honoring the
        // swizzle map when swizzling is enabled
        if (IsSwizzledT::value)
            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
            inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
        // [gap: 'else' branch marker missing from paste]
            inputSlot = VERTEX_ATTRIB_START_SLOT + i;

        __m128 attrib[3]; // triangle attribs (always 4 wide)
        float* pAttribStart = pBuffer;

        if (HasConstantInterpT::value || IsDegenerate::value)
            if (_bittest(&constantInterpMask, i))
                // constant interpolation: replicate the provoking vertex's
                // attribute to all verts.  Which vertex provokes depends on
                // the topology (quads/strips remap it per triangle).
                uint32_t adjustedTriIndex;
                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
                static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
                static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
                static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
                static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };

                // [gap: 'switch (topo)' and leading quad-list case label missing from paste]
                adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
                vid = quadProvokingVertex[triIndex & 1][provokingVertex];
                // [gap: 'break;' and quad-strip case label missing from paste]
                adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
                vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
            case TOP_TRIANGLE_STRIP:
                adjustedTriIndex = triIndex;
                // [gap: left-hand side of this conditional (e.g. 'vid = (triIndex & 1)') missing]
                    ? tristripProvokingVertex[provokingVertex]
                // [gap: ': provokingVertex;', 'break;' and 'default:' label missing]
                adjustedTriIndex = triIndex;
                vid = provokingVertex;

                pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);

                // replicate provoking vertex attribute to every output vert
                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                    _mm_store_ps(pBuffer, attrib[vid]);
                    // [gap: pBuffer advance missing from paste]
            // [gap: 'else' branch marker missing — per-vertex (linear) interpolation path]
                pa.AssembleSingle(inputSlot, triIndex, attrib);
                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                    _mm_store_ps(pBuffer, attrib[i]);
        // [gap: 'else' branch marker missing — no constant interpolation at all]
            pa.AssembleSingle(inputSlot, triIndex, attrib);
            for (uint32_t i = 0; i < NumVertsT::value; ++i)
                _mm_store_ps(pBuffer, attrib[i]);

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t v = NumVertsT::value; v < 3; ++v)
            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);

        // check for constant source overrides
        if (IsSwizzledT::value)
            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
            // [gap: 'if (mask)' guard and declaration of 'comp' missing from paste]
            while (_BitScanForward(&comp, mask))
                mask &= ~(1 << comp);

                float constantValue = 0.0f;
                switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
                case SWR_CONSTANT_SOURCE_CONST_0000:
                case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
                case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
                    constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
                    // [gap: 'break;' missing from paste]
                case SWR_CONSTANT_SOURCE_PRIM_ID:
                    // reinterpret the primitive id bits as a float
                    constantValue = *(float*)&primId;

                // apply constant value to all 3 vertices
                for (uint32_t v = 0; v < 3; ++v)
                    pAttribStart[comp + v * 4] = constantValue;
202 //////////////////////////////////////////////////////////////////////////
203 /// @brief Gather scissor rect data based on per-prim viewport indices.
204 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
205 /// @param pViewportIndex - array of per-primitive vewport indexes.
206 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
207 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
208 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
209 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
211 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
212 template<size_t SimdWidth
>
213 struct GatherScissors
215 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
216 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
217 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
219 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
224 struct GatherScissors
<8>
226 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
227 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
228 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
230 scisXmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
231 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
232 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
233 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
234 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
235 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
236 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
237 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
238 scisYmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
239 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
240 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
241 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
242 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
243 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
244 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
245 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
246 scisXmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
247 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
248 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
249 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
250 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
251 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
252 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
253 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
254 scisYmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
255 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
256 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
257 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
258 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
259 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
260 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
261 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
265 #if USE_SIMD16_FRONTEND
266 template<size_t SimdWidth
>
267 struct GatherScissors_simd16
269 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
270 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
,
271 simd16scalari
&scisXmax
, simd16scalari
&scisYmax
)
273 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
278 struct GatherScissors_simd16
<16>
280 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
281 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
,
282 simd16scalari
&scisXmax
, simd16scalari
&scisYmax
)
284 scisXmin
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
285 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
286 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
287 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
288 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
289 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
290 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
291 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
,
292 pScissorsInFixedPoint
[pViewportIndex
[8]].xmin
,
293 pScissorsInFixedPoint
[pViewportIndex
[9]].xmin
,
294 pScissorsInFixedPoint
[pViewportIndex
[10]].xmin
,
295 pScissorsInFixedPoint
[pViewportIndex
[11]].xmin
,
296 pScissorsInFixedPoint
[pViewportIndex
[12]].xmin
,
297 pScissorsInFixedPoint
[pViewportIndex
[13]].xmin
,
298 pScissorsInFixedPoint
[pViewportIndex
[14]].xmin
,
299 pScissorsInFixedPoint
[pViewportIndex
[15]].xmin
);
301 scisYmin
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
302 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
303 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
304 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
305 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
306 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
307 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
308 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
,
309 pScissorsInFixedPoint
[pViewportIndex
[8]].ymin
,
310 pScissorsInFixedPoint
[pViewportIndex
[9]].ymin
,
311 pScissorsInFixedPoint
[pViewportIndex
[10]].ymin
,
312 pScissorsInFixedPoint
[pViewportIndex
[11]].ymin
,
313 pScissorsInFixedPoint
[pViewportIndex
[12]].ymin
,
314 pScissorsInFixedPoint
[pViewportIndex
[13]].ymin
,
315 pScissorsInFixedPoint
[pViewportIndex
[14]].ymin
,
316 pScissorsInFixedPoint
[pViewportIndex
[15]].ymin
);
318 scisXmax
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
319 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
320 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
321 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
322 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
323 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
324 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
325 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
,
326 pScissorsInFixedPoint
[pViewportIndex
[8]].xmax
,
327 pScissorsInFixedPoint
[pViewportIndex
[9]].xmax
,
328 pScissorsInFixedPoint
[pViewportIndex
[10]].xmax
,
329 pScissorsInFixedPoint
[pViewportIndex
[11]].xmax
,
330 pScissorsInFixedPoint
[pViewportIndex
[12]].xmax
,
331 pScissorsInFixedPoint
[pViewportIndex
[13]].xmax
,
332 pScissorsInFixedPoint
[pViewportIndex
[14]].xmax
,
333 pScissorsInFixedPoint
[pViewportIndex
[15]].xmax
);
335 scisYmax
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
336 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
337 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
338 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
339 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
340 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
341 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
342 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
,
343 pScissorsInFixedPoint
[pViewportIndex
[8]].ymax
,
344 pScissorsInFixedPoint
[pViewportIndex
[9]].ymax
,
345 pScissorsInFixedPoint
[pViewportIndex
[10]].ymax
,
346 pScissorsInFixedPoint
[pViewportIndex
[11]].ymax
,
347 pScissorsInFixedPoint
[pViewportIndex
[12]].ymax
,
348 pScissorsInFixedPoint
[pViewportIndex
[13]].ymax
,
349 pScissorsInFixedPoint
[pViewportIndex
[14]].ymax
,
350 pScissorsInFixedPoint
[pViewportIndex
[15]].ymax
);
355 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
357 struct ProcessAttributesChooser
359 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
361 template <typename
... ArgsB
>
362 static FuncType
GetFunc()
364 return ProcessAttributes
<ArgsB
...>;
368 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
370 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
//////////////////////////////////////////////////////////////////////////
/// @brief Processes enabled user clip distances. Loads the active clip
///        distances from the PA, sets up barycentric equations, and
///        stores the results to the output buffer
/// @param pa - Primitive Assembly state
/// @param primIndex - primitive index to process
/// @param clipDistMask - mask of enabled clip distances
/// @param pRecipW - 1/w per vertex (NumVerts entries)
/// @param pUserClipBuffer - buffer to store results
// NOTE(review): this paste is missing the opening brace, the declaration of
// 'clipDist', and the interior block braces; gaps are marked and must be
// restored from the original file.
template<uint32_t NumVerts>
void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
    // [gap: opening brace and declaration of 'clipDist' missing from paste]
    while (_BitScanForward(&clipDist, clipDistMask))
        clipDistMask &= ~(1 << clipDist);
        // clip distances are packed 4 per attribute slot: LO holds 0..3, HI holds 4..7
        uint32_t clipSlot = clipDist >> 2;
        uint32_t clipComp = clipDist & 0x3;
        uint32_t clipAttribSlot = clipSlot == 0 ?
            VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;

        __m128 primClipDist[3];
        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);

        // extract this clip distance's single component from each vertex
        float vertClipDist[NumVerts];
        for (uint32_t e = 0; e < NumVerts; ++e)
            OSALIGNSIMD(float) aVertClipDist[4];
            _mm_store_ps(aVertClipDist, primClipDist[e]);
            vertClipDist[e] = aVertClipDist[clipComp];

        // setup plane equations for barycentric interpolation in the backend
        // coefficients are expressed relative to the last vertex's (dist * 1/w)
        float baryCoeff[NumVerts];
        float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
        for (uint32_t e = 0; e < NumVerts - 1; ++e)
            baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
        baryCoeff[NumVerts - 1] = last;

        for (uint32_t e = 0; e < NumVerts; ++e)
            *(pUserClipBuffer++) = baryCoeff[e];
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param primID - Primitive ID for each triangle.
/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
// NOTE(review): this paste has dropped many source lines inside this function
// (most of the signature, braces, 'else' lines, several local declarations,
// a '#endif', and the 'endBinTriangles:' label).  Gaps are marked inline and
// must be restored from the original file before compiling.
template <typename CT>
// [gap: function name and all parameters before the last one missing from paste]
    simdscalari viewportIdx)
// [gap: opening brace missing]
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    // [gap: 'else' branch marker missing from paste]
        // Perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        // scale x/y/z by 1/w
        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

    // Viewport transform to screen space coords
    if (state.gsState.emitsViewportArrayIndex)
        viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
    // [gap: 'else' branch marker missing from paste]
        viewportTransform<3>(tri, state.vpMatrices);

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle edge setup
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // [gap: declaration of 'vDet' missing from paste]
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // per-lane zero-area (degenerate) triangle mask
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
        triMask &= ~cullZeroAreaMask;

    // determine front winding tris
    // 0 area triangles are marked as backfacing regardless of winding order,
    // which is required behavior for conservative rast and wireframe rendering
    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
        maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
        maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    // [gap: 'else' branch marker missing — CCW winding: determinant < 0 is front]
        maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
        maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
    frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    // [gap: declaration of 'cullTris' and the switch's opening brace missing from paste]
    switch ((SWR_CULLMODE)rastState.cullMode)
    case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE: cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
        // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
    default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    // [gap: declarations of 'triIndex' and 'edgeEnable' missing from paste]
    PFN_WORK_FUNC pfnWork;
    if (CT::IsConservativeT::value)
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if (cullZeroAreaMask > 0)
            // e0: v0 == v1 per lane?
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1: v1 == v2 per lane?
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // deposit each per-lane edge bit into a 3-bits-per-triangle enable mask
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        // [gap: 'else' branch marker missing — no degenerates: enable all edges]
            edgeEnable = 0x00FFFFFF;
    // [gap: 'else' branch marker missing — non-conservative path]
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
            (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));

    // [gap: the guard condition around this goto (presumably an empty-mask check) is missing from paste]
        goto endBinTriangles;

    // Calc bounding box of triangles
    // [gap: declaration of 'bbox' missing from paste]
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
       (!CT::IsConservativeT::value))
        origTriMask = triMask;

        // [gap: declaration of 'cullCenterMask' missing from paste]
        simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
        xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
        simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
        xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

        // no pixel center spanned horizontally?
        simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

        simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
        ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
        simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
        ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

        // ... or vertically?
        simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
        vMaskV = _simd_or_si(vMaskH, vMaskV);
        cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));

        triMask &= ~cullCenterMask;

        if (origTriMask ^ triMask)
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    else // broadcast fast path for non-VPAI case.
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);

    // Make triangle bbox inclusive
    bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
    bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
    bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);

    if (CT::IsConservativeT::value)
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);

        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);

    // Cull tris completely outside scissor
    simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
    simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
    simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
    uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
    triMask = triMask & ~maskOutsideScissor;

    // [gap: an early-out when triMask becomes empty presumably lived here — missing from paste]

    // Send surviving triangles to the line or point binner based on fill mode
    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
        // Simple non-conformant wireframe mode, useful for debugging.
        // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD

        // [gap: declaration of 'line' missing from paste]
        simdscalar recipW[2];

        // [gap: line[0]/line[1] vertex assignments missing for each of the three edges]
        recipW[0] = vRecipW0;
        recipW[1] = vRecipW1;
        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        recipW[0] = vRecipW1;
        recipW[1] = vRecipW2;
        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        recipW[0] = vRecipW2;
        recipW[1] = vRecipW0;
        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        AR_END(FEBinTriangles, 1);
        // [gap: a 'return;' presumably followed here — missing from paste]
    else if (rastState.fillMode == SWR_FILLMODE_POINT)
        // bin each triangle vertex as a point
        BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
        BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
        BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
        // [gap: AR_END / 'return;' for the point path missing from paste]

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        // [gap: declaration of 'vRtai' missing from paste]
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        // [gap: declaration of 'vRtaii' missing from paste]
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    // [gap: 'else' branch marker missing — no RTAI emitted, default all lanes to 0]
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        // [gap: declaration of the 'work' item and 'isDegenerate' missing from paste]
        if (CT::IsConservativeT::value)
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
                (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        // [gap: 'else' branch marker missing from paste]
            isDegenerate = false;
            work.pfnWork = pfnWork;

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);

        // enqueue the work item on every macrotile the bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
// [gap: matching '#endif' for KNOB_ENABLE_TOSS_POINTS missing from paste]
                    pTileMgr->enqueue(x, y, &work);

        triMask &= ~(1 << triIndex);

// [gap: 'endBinTriangles:' label missing from paste]
    AR_END(FEBinTriangles, 1);
859 #if USE_SIMD16_FRONTEND
860 template <typename CT
>
861 void SIMDAPI
BinTriangles_simd16(
867 simd16scalari primID
,
868 simd16scalari viewportIdx
)
870 SWR_CONTEXT
*pContext
= pDC
->pContext
;
872 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
874 const API_STATE
& state
= GetApiState(pDC
);
875 const SWR_RASTSTATE
& rastState
= state
.rastState
;
876 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
877 const SWR_GS_STATE
& gsState
= state
.gsState
;
879 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
881 simd16scalar vRecipW0
= _simd16_set1_ps(1.0f
);
882 simd16scalar vRecipW1
= _simd16_set1_ps(1.0f
);
883 simd16scalar vRecipW2
= _simd16_set1_ps(1.0f
);
885 if (feState
.vpTransformDisable
)
887 // RHW is passed in directly when VP transform is disabled
888 vRecipW0
= tri
[0].v
[3];
889 vRecipW1
= tri
[1].v
[3];
890 vRecipW2
= tri
[2].v
[3];
894 // Perspective divide
895 vRecipW0
= _simd16_div_ps(_simd16_set1_ps(1.0f
), tri
[0].w
);
896 vRecipW1
= _simd16_div_ps(_simd16_set1_ps(1.0f
), tri
[1].w
);
897 vRecipW2
= _simd16_div_ps(_simd16_set1_ps(1.0f
), tri
[2].w
);
899 tri
[0].v
[0] = _simd16_mul_ps(tri
[0].v
[0], vRecipW0
);
900 tri
[1].v
[0] = _simd16_mul_ps(tri
[1].v
[0], vRecipW1
);
901 tri
[2].v
[0] = _simd16_mul_ps(tri
[2].v
[0], vRecipW2
);
903 tri
[0].v
[1] = _simd16_mul_ps(tri
[0].v
[1], vRecipW0
);
904 tri
[1].v
[1] = _simd16_mul_ps(tri
[1].v
[1], vRecipW1
);
905 tri
[2].v
[1] = _simd16_mul_ps(tri
[2].v
[1], vRecipW2
);
907 tri
[0].v
[2] = _simd16_mul_ps(tri
[0].v
[2], vRecipW0
);
908 tri
[1].v
[2] = _simd16_mul_ps(tri
[1].v
[2], vRecipW1
);
909 tri
[2].v
[2] = _simd16_mul_ps(tri
[2].v
[2], vRecipW2
);
911 // Viewport transform to screen space coords
912 if (state
.gsState
.emitsViewportArrayIndex
)
914 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
918 viewportTransform
<3>(tri
, state
.vpMatrices
);
922 // Adjust for pixel center location
923 const simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
925 tri
[0].x
= _simd16_add_ps(tri
[0].x
, offset
);
926 tri
[0].y
= _simd16_add_ps(tri
[0].y
, offset
);
928 tri
[1].x
= _simd16_add_ps(tri
[1].x
, offset
);
929 tri
[1].y
= _simd16_add_ps(tri
[1].y
, offset
);
931 tri
[2].x
= _simd16_add_ps(tri
[2].x
, offset
);
932 tri
[2].y
= _simd16_add_ps(tri
[2].y
, offset
);
934 simd16scalari vXi
[3], vYi
[3];
936 // Set vXi, vYi to required fixed point precision
937 FPToFixedPoint(tri
, vXi
, vYi
);
940 simd16scalari vAi
[3], vBi
[3];
941 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
944 simd16scalari vDet
[2];
945 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
948 uint32_t maskLo
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet
[0], _simd16_setzero_si())));
949 uint32_t maskHi
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet
[1], _simd16_setzero_si())));
951 uint32_t cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD16_WIDTH
/ 2));
953 // don't cull degenerate triangles if we're conservatively rasterizing
954 uint32_t origTriMask
= triMask
;
955 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
957 triMask
&= ~cullZeroAreaMask
;
960 // determine front winding tris
963 // 0 area triangles are marked as backfacing regardless of winding order,
964 // which is required behavior for conservative rast and wireframe rendering
965 uint32_t frontWindingTris
;
966 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
968 maskLo
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet
[0], _simd16_setzero_si())));
969 maskHi
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet
[1], _simd16_setzero_si())));
973 maskLo
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet
[0])));
974 maskHi
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet
[1])));
976 frontWindingTris
= maskLo
| (maskHi
<< (KNOB_SIMD16_WIDTH
/ 2));
980 switch ((SWR_CULLMODE
)rastState
.cullMode
)
982 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
983 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
984 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
985 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
986 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
987 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
990 triMask
&= ~cullTris
;
992 if (origTriMask
^ triMask
)
994 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
997 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
998 // compute per tri backface
999 uint32_t frontFaceMask
= frontWindingTris
;
1000 uint32_t *pPrimID
= (uint32_t *)&primID
;
1001 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1004 uint32_t edgeEnable
;
1005 PFN_WORK_FUNC pfnWork
;
1006 if (CT::IsConservativeT::value
)
1008 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1009 // used to call the appropriate templated rasterizer function
1010 if (cullZeroAreaMask
> 0)
1013 const simd16scalari x0x1Mask
= _simd16_cmpeq_epi32(vXi
[0], vXi
[1]);
1014 const simd16scalari y0y1Mask
= _simd16_cmpeq_epi32(vYi
[0], vYi
[1]);
1016 uint32_t e0Mask
= _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask
, y0y1Mask
)));
1019 const simd16scalari x1x2Mask
= _simd16_cmpeq_epi32(vXi
[1], vXi
[2]);
1020 const simd16scalari y1y2Mask
= _simd16_cmpeq_epi32(vYi
[1], vYi
[2]);
1022 uint32_t e1Mask
= _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask
, y1y2Mask
)));
1025 // if v0 == v1 & v1 == v2, v0 == v2
1026 uint32_t e2Mask
= e0Mask
& e1Mask
;
1027 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
1029 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1030 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1031 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
1033 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1034 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
1036 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1037 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
1039 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
1043 edgeEnable
= 0x00FFFFFF;
1048 // degenerate triangles won't be sent to rasterizer; just enable all edges
1049 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
1050 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
1055 goto endBinTriangles
;
1058 // Calc bounding box of triangles
1060 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
1062 // determine if triangle falls between pixel centers and discard
1063 // only discard for non-MSAA case and when conservative rast is disabled
1064 // (xmin + 127) & ~255
1065 // (xmax + 128) & ~255
1066 if ((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
1067 (!CT::IsConservativeT::value
))
1069 origTriMask
= triMask
;
1074 simd16scalari xmin
= _simd16_add_epi32(bbox
.xmin
, _simd16_set1_epi32(127));
1075 xmin
= _simd16_and_si(xmin
, _simd16_set1_epi32(~255));
1076 simd16scalari xmax
= _simd16_add_epi32(bbox
.xmax
, _simd16_set1_epi32(128));
1077 xmax
= _simd16_and_si(xmax
, _simd16_set1_epi32(~255));
1079 simd16scalari vMaskH
= _simd16_cmpeq_epi32(xmin
, xmax
);
1081 simd16scalari ymin
= _simd16_add_epi32(bbox
.ymin
, _simd16_set1_epi32(127));
1082 ymin
= _simd16_and_si(ymin
, _simd16_set1_epi32(~255));
1083 simd16scalari ymax
= _simd16_add_epi32(bbox
.ymax
, _simd16_set1_epi32(128));
1084 ymax
= _simd16_and_si(ymax
, _simd16_set1_epi32(~255));
1086 simd16scalari vMaskV
= _simd16_cmpeq_epi32(ymin
, ymax
);
1088 vMaskV
= _simd16_or_si(vMaskH
, vMaskV
);
1089 cullCenterMask
= _simd16_movemask_ps(_simd16_castsi_ps(vMaskV
));
1092 triMask
&= ~cullCenterMask
;
1094 if (origTriMask
^ triMask
)
1096 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
1100 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1101 // Gather the AOS effective scissor rects based on the per-prim VP index.
1102 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1103 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1105 if (state
.gsState
.emitsViewportArrayIndex
)
1107 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1108 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1110 else // broadcast fast path for non-VPAI case.
1112 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1113 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1114 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1115 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1118 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
1119 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
1120 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
1121 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
1123 if (CT::IsConservativeT::value
)
1125 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1126 // some area. Bump the xmax/ymax edges out
1127 simd16scalari topEqualsBottom
= _simd16_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
1128 bbox
.ymax
= _simd16_blendv_epi32(bbox
.ymax
, _simd16_add_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), topEqualsBottom
);
1129 simd16scalari leftEqualsRight
= _simd16_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
1130 bbox
.xmax
= _simd16_blendv_epi32(bbox
.xmax
, _simd16_add_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), leftEqualsRight
);
1133 // Cull tris completely outside scissor
1135 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1136 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1137 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1138 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
1139 triMask
= triMask
& ~maskOutsideScissor
;
1144 // Send surviving triangles to the line or point binner based on fill mode
1145 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
1147 // Simple non-conformant wireframe mode, useful for debugging
1148 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1149 simd16vector line
[2];
1150 simd16scalar recipW
[2];
1153 recipW
[0] = vRecipW0
;
1154 recipW
[1] = vRecipW1
;
1155 BinPostSetupLines_simd16(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
1159 recipW
[0] = vRecipW1
;
1160 recipW
[1] = vRecipW2
;
1161 BinPostSetupLines_simd16(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
1165 recipW
[0] = vRecipW2
;
1166 recipW
[1] = vRecipW0
;
1167 BinPostSetupLines_simd16(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
1169 AR_END(FEBinTriangles
, 1);
1172 else if (rastState
.fillMode
== SWR_FILLMODE_POINT
)
1175 BinPostSetupPoints_simd16(pDC
, pa
, workerId
, &tri
[0], triMask
, primID
, viewportIdx
);
1176 BinPostSetupPoints_simd16(pDC
, pa
, workerId
, &tri
[1], triMask
, primID
, viewportIdx
);
1177 BinPostSetupPoints_simd16(pDC
, pa
, workerId
, &tri
[2], triMask
, primID
, viewportIdx
);
1181 // Convert triangle bbox to macrotile units.
1182 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1183 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1184 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1185 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1187 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
1189 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
1190 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
1191 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
1192 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
1194 // transpose verts needed for backend
1195 /// @todo modify BE to take non-transformed verts
1196 __m128 vHorizX
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1197 __m128 vHorizY
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1198 __m128 vHorizZ
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1199 __m128 vHorizW
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1201 vTranspose3x8(vHorizX
[0], _simd16_extract_ps(tri
[0].x
, 0), _simd16_extract_ps(tri
[1].x
, 0), _simd16_extract_ps(tri
[2].x
, 0));
1202 vTranspose3x8(vHorizY
[0], _simd16_extract_ps(tri
[0].y
, 0), _simd16_extract_ps(tri
[1].y
, 0), _simd16_extract_ps(tri
[2].y
, 0));
1203 vTranspose3x8(vHorizZ
[0], _simd16_extract_ps(tri
[0].z
, 0), _simd16_extract_ps(tri
[1].z
, 0), _simd16_extract_ps(tri
[2].z
, 0));
1204 vTranspose3x8(vHorizW
[0], _simd16_extract_ps(vRecipW0
, 0), _simd16_extract_ps(vRecipW1
, 0), _simd16_extract_ps(vRecipW2
, 0));
1206 vTranspose3x8(vHorizX
[1], _simd16_extract_ps(tri
[0].x
, 1), _simd16_extract_ps(tri
[1].x
, 1), _simd16_extract_ps(tri
[2].x
, 1));
1207 vTranspose3x8(vHorizY
[1], _simd16_extract_ps(tri
[0].y
, 1), _simd16_extract_ps(tri
[1].y
, 1), _simd16_extract_ps(tri
[2].y
, 1));
1208 vTranspose3x8(vHorizZ
[1], _simd16_extract_ps(tri
[0].z
, 1), _simd16_extract_ps(tri
[1].z
, 1), _simd16_extract_ps(tri
[2].z
, 1));
1209 vTranspose3x8(vHorizW
[1], _simd16_extract_ps(vRecipW0
, 1), _simd16_extract_ps(vRecipW1
, 1), _simd16_extract_ps(vRecipW2
, 1));
1211 // store render target array index
1212 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1213 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1215 simd16vector vRtai
[3];
1216 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
1217 simd16scalari vRtaii
;
1218 vRtaii
= _simd16_castps_si(vRtai
[0].x
);
1219 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1223 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1227 // scan remaining valid triangles and bin each separately
1228 while (_BitScanForward(&triIndex
, triMask
))
1230 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1231 uint32_t numScalarAttribs
= linkageCount
* 4;
1237 if (CT::IsConservativeT::value
)
1239 // only rasterize valid edges if we have a degenerate primitive
1240 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
1241 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
1242 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
1244 // Degenerate triangles are required to be constant interpolated
1245 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
1249 isDegenerate
= false;
1250 work
.pfnWork
= pfnWork
;
1253 // Select attribute processor
1254 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
1255 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
1257 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1259 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
1260 desc
.triFlags
.primID
= pPrimID
[triIndex
];
1261 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
1262 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
1264 auto pArena
= pDC
->pArena
;
1265 SWR_ASSERT(pArena
!= nullptr);
1267 // store active attribs
1268 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1269 desc
.pAttribs
= pAttribs
;
1270 desc
.numAttribs
= linkageCount
;
1271 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
1273 // store triangle vertex data
1274 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1277 const uint32_t i
= triIndex
>> 3; // triIndex / KNOB_SIMD_WIDTH
1278 const uint32_t j
= triIndex
& 7; // triIndex % KNOB_SIMD_WIDTH
1280 _mm_store_ps(&desc
.pTriBuffer
[ 0], vHorizX
[i
][j
]);
1281 _mm_store_ps(&desc
.pTriBuffer
[ 4], vHorizY
[i
][j
]);
1282 _mm_store_ps(&desc
.pTriBuffer
[ 8], vHorizZ
[i
][j
]);
1283 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[i
][j
]);
1286 // store user clip distances
1287 if (rastState
.clipDistanceMask
)
1289 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1290 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1291 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1294 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
1296 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
1298 #if KNOB_ENABLE_TOSS_POINTS
1299 if (!KNOB_TOSS_SETUP_TRIS
)
1302 pTileMgr
->enqueue(x
, y
, &work
);
1307 triMask
&= ~(1 << triIndex
);
1310 AR_END(FEBinTriangles
, 1);
1314 struct FEBinTrianglesChooser
1316 typedef PFN_PROCESS_PRIMS FuncType
;
1318 template <typename
... ArgsB
>
1319 static FuncType
GetFunc()
1321 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
1325 // Selector for correct templated BinTrinagles function
1326 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
1328 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
1331 #if USE_SIMD16_FRONTEND
1332 struct FEBinTrianglesChooser_simd16
1334 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType
;
1336 template <typename
... ArgsB
>
1337 static FuncType
GetFunc()
1339 return BinTriangles_simd16
<ConservativeRastFETraits
<ArgsB
...>>;
1343 // Selector for correct templated BinTrinagles function
1344 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
)
1346 return TemplateArgUnroller
<FEBinTrianglesChooser_simd16
>::GetFunc(IsConservative
);
1351 void BinPostSetupPoints(
1358 simdscalari viewportIdx
)
1360 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1362 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1364 simdvector
& primVerts
= prim
[0];
1366 const API_STATE
& state
= GetApiState(pDC
);
1367 const SWR_GS_STATE
& gsState
= state
.gsState
;
1368 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1369 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1371 // Select attribute processor
1372 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1373 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1375 // convert to fixed point
1376 simdscalari vXi
, vYi
;
1377 vXi
= fpToFixedPointVertical(primVerts
.x
);
1378 vYi
= fpToFixedPointVertical(primVerts
.y
);
1380 if (CanUseSimplePoints(pDC
))
1382 // adjust for ymin-xmin rule
1383 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
1384 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
1386 // cull points off the ymin-xmin edge of the viewport
1387 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
1388 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
1390 // compute macro tile coordinates
1391 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1392 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1394 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
1395 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
1396 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
1398 // compute raster tile coordinates
1399 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1400 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1402 // compute raster tile relative x,y for coverage mask
1403 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1404 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1406 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1407 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1409 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
1410 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
1411 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
1412 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
1414 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
1415 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
1416 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
1417 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
1419 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
1420 _simd_store_ps((float*)aZ
, primVerts
.z
);
1422 // store render target array index
1423 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1424 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1427 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
1428 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
1429 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1433 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1436 uint32_t *pPrimID
= (uint32_t *)&primID
;
1437 DWORD primIndex
= 0;
1439 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1441 // scan remaining valid triangles and bin each separately
1442 while (_BitScanForward(&primIndex
, primMask
))
1444 uint32_t linkageCount
= backendState
.numAttributes
;
1445 uint32_t numScalarAttribs
= linkageCount
* 4;
1450 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1452 // points are always front facing
1453 desc
.triFlags
.frontFacing
= 1;
1454 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1455 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1456 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1458 work
.pfnWork
= RasterizeSimplePoint
;
1460 auto pArena
= pDC
->pArena
;
1461 SWR_ASSERT(pArena
!= nullptr);
1464 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1465 desc
.pAttribs
= pAttribs
;
1466 desc
.numAttribs
= linkageCount
;
1468 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1470 // store raster tile aligned x, y, perspective correct z
1471 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1472 desc
.pTriBuffer
= pTriBuffer
;
1473 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1474 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1475 *pTriBuffer
= aZ
[primIndex
];
1477 uint32_t tX
= aTileRelativeX
[primIndex
];
1478 uint32_t tY
= aTileRelativeY
[primIndex
];
1480 // pack the relative x,y into the coverageMask, the rasterizer will
1481 // generate the true coverage mask from it
1482 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1485 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1486 #if KNOB_ENABLE_TOSS_POINTS
1487 if (!KNOB_TOSS_SETUP_TRIS
)
1490 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1492 primMask
&= ~(1 << primIndex
);
1497 // non simple points need to be potentially binned to multiple macro tiles
1498 simdscalar vPointSize
;
1499 if (rastState
.pointParam
)
1502 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
1503 vPointSize
= size
[0].x
;
1507 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
1510 // bloat point to bbox
1512 bbox
.xmin
= bbox
.xmax
= vXi
;
1513 bbox
.ymin
= bbox
.ymax
= vYi
;
1515 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
1516 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1517 bbox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1518 bbox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
1519 bbox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1520 bbox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
1522 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1523 // Gather the AOS effective scissor rects based on the per-prim VP index.
1524 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1525 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1526 if (state
.gsState
.emitsViewportArrayIndex
)
1528 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1529 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1531 else // broadcast fast path for non-VPAI case.
1533 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1534 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1535 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1536 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1539 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
1540 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
1541 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
1542 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
1544 // Cull bloated points completely outside scissor
1545 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1546 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1547 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1548 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1549 primMask
= primMask
& ~maskOutsideScissor
;
1551 // Convert bbox to macrotile units.
1552 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1553 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1554 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1555 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1557 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1558 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
1559 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
1560 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
1561 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
1563 // store render target array index
1564 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1565 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1567 simdvector vRtai
[2];
1568 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
1569 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
1570 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1574 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1577 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
1578 _simd_store_ps((float*)aPointSize
, vPointSize
);
1580 uint32_t *pPrimID
= (uint32_t *)&primID
;
1582 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
1583 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
1584 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
1586 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
1587 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
1588 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
1590 // scan remaining valid prims and bin each separately
1591 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1593 while (_BitScanForward(&primIndex
, primMask
))
1595 uint32_t linkageCount
= backendState
.numAttributes
;
1596 uint32_t numScalarAttribs
= linkageCount
* 4;
1601 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1603 desc
.triFlags
.frontFacing
= 1;
1604 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1605 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1606 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1607 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1609 work
.pfnWork
= RasterizeTriPoint
;
1611 auto pArena
= pDC
->pArena
;
1612 SWR_ASSERT(pArena
!= nullptr);
1614 // store active attribs
1615 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1616 desc
.numAttribs
= linkageCount
;
1617 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1619 // store point vertex data
1620 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1621 desc
.pTriBuffer
= pTriBuffer
;
1622 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1623 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1624 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1626 // store user clip distances
1627 if (rastState
.clipDistanceMask
)
1629 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1630 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1633 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
1634 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1635 desc
.pUserClipBuffer
[3*i
+ 0] = 0.0f
;
1636 desc
.pUserClipBuffer
[3*i
+ 1] = 0.0f
;
1637 desc
.pUserClipBuffer
[3*i
+ 2] = dists
[i
];
1641 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1642 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1644 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1646 #if KNOB_ENABLE_TOSS_POINTS
1647 if (!KNOB_TOSS_SETUP_TRIS
)
1650 pTileMgr
->enqueue(x
, y
, &work
);
1655 primMask
&= ~(1 << primIndex
);
1659 AR_END(FEBinPoints
, 1);
1662 //////////////////////////////////////////////////////////////////////////
1663 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1664 /// @param pDC - pointer to draw context.
1665 /// @param pa - The primitive assembly object.
1666 /// @param workerId - thread's worker id. Each thread has a unique id.
1667 /// @param tri - Contains point position data for SIMDs worth of points.
1668 /// @param primID - Primitive ID for each point.
1676 simdscalari viewportIdx
)
1678 simdvector
& primVerts
= prim
[0];
1680 const API_STATE
& state
= GetApiState(pDC
);
1681 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1682 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1684 if (!feState
.vpTransformDisable
)
1686 // perspective divide
1687 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
1688 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
1689 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
1690 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
1692 // viewport transform to screen coords
1693 if (state
.gsState
.emitsViewportArrayIndex
)
1695 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
1699 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
1703 // adjust for pixel center location
1704 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1705 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
1706 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
1718 #if USE_SIMD16_FRONTEND
1719 void BinPostSetupPoints_simd16(
1723 simd16vector prim
[],
1725 simd16scalari primID
,
1726 simd16scalari viewportIdx
)
1728 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1730 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1732 simd16vector
& primVerts
= prim
[0];
1734 const API_STATE
& state
= GetApiState(pDC
);
1735 const SWR_GS_STATE
& gsState
= state
.gsState
;
1736 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1737 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1739 // Select attribute processor
1740 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1741 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1743 // convert to fixed point
1744 simd16scalari vXi
, vYi
;
1746 vXi
= fpToFixedPointVertical(primVerts
.x
);
1747 vYi
= fpToFixedPointVertical(primVerts
.y
);
1749 if (CanUseSimplePoints(pDC
))
1751 // adjust for ymin-xmin rule
1752 vXi
= _simd16_sub_epi32(vXi
, _simd16_set1_epi32(1));
1753 vYi
= _simd16_sub_epi32(vYi
, _simd16_set1_epi32(1));
1755 // cull points off the ymin-xmin edge of the viewport
1756 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi
));
1757 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi
));
1759 // compute macro tile coordinates
1760 simd16scalari macroX
= _simd16_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1761 simd16scalari macroY
= _simd16_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1763 OSALIGNSIMD16(uint32_t) aMacroX
[KNOB_SIMD16_WIDTH
], aMacroY
[KNOB_SIMD16_WIDTH
];
1765 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMacroX
), macroX
);
1766 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMacroY
), macroY
);
1768 // compute raster tile coordinates
1769 simd16scalari rasterX
= _simd16_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1770 simd16scalari rasterY
= _simd16_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1772 // compute raster tile relative x,y for coverage mask
1773 simd16scalari tileAlignedX
= _simd16_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1774 simd16scalari tileAlignedY
= _simd16_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1776 simd16scalari tileRelativeX
= _simd16_sub_epi32(_simd16_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1777 simd16scalari tileRelativeY
= _simd16_sub_epi32(_simd16_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1779 OSALIGNSIMD16(uint32_t) aTileRelativeX
[KNOB_SIMD16_WIDTH
];
1780 OSALIGNSIMD16(uint32_t) aTileRelativeY
[KNOB_SIMD16_WIDTH
];
1782 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileRelativeX
), tileRelativeX
);
1783 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileRelativeY
), tileRelativeY
);
1785 OSALIGNSIMD16(uint32_t) aTileAlignedX
[KNOB_SIMD16_WIDTH
];
1786 OSALIGNSIMD16(uint32_t) aTileAlignedY
[KNOB_SIMD16_WIDTH
];
1788 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileAlignedX
), tileAlignedX
);
1789 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileAlignedY
), tileAlignedY
);
1791 OSALIGNSIMD16(float) aZ
[KNOB_SIMD16_WIDTH
];
1792 _simd16_store_ps(reinterpret_cast<float *>(aZ
), primVerts
.z
);
1794 // store render target array index
1795 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1796 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1799 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, &vRtai
);
1800 simd16scalari vRtaii
= _simd16_castps_si(vRtai
.x
);
1801 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1805 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1808 uint32_t *pPrimID
= (uint32_t *)&primID
;
1809 DWORD primIndex
= 0;
1811 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1813 // scan remaining valid triangles and bin each separately
1814 while (_BitScanForward(&primIndex
, primMask
))
1816 uint32_t linkageCount
= backendState
.numAttributes
;
1817 uint32_t numScalarAttribs
= linkageCount
* 4;
1822 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1824 // points are always front facing
1825 desc
.triFlags
.frontFacing
= 1;
1826 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1827 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1828 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1830 work
.pfnWork
= RasterizeSimplePoint
;
1832 auto pArena
= pDC
->pArena
;
1833 SWR_ASSERT(pArena
!= nullptr);
1836 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1837 desc
.pAttribs
= pAttribs
;
1838 desc
.numAttribs
= linkageCount
;
1840 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1842 // store raster tile aligned x, y, perspective correct z
1843 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1844 desc
.pTriBuffer
= pTriBuffer
;
1845 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1846 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1847 *pTriBuffer
= aZ
[primIndex
];
1849 uint32_t tX
= aTileRelativeX
[primIndex
];
1850 uint32_t tY
= aTileRelativeY
[primIndex
];
1852 // pack the relative x,y into the coverageMask, the rasterizer will
1853 // generate the true coverage mask from it
1854 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1857 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1858 #if KNOB_ENABLE_TOSS_POINTS
1859 if (!KNOB_TOSS_SETUP_TRIS
)
1862 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1865 primMask
&= ~(1 << primIndex
);
1870 // non simple points need to be potentially binned to multiple macro tiles
1871 simd16scalar vPointSize
;
1873 if (rastState
.pointParam
)
1875 simd16vector size
[3];
1876 pa
.Assemble_simd16(VERTEX_POINT_SIZE_SLOT
, size
);
1877 vPointSize
= size
[0].x
;
1881 vPointSize
= _simd16_set1_ps(rastState
.pointSize
);
1884 // bloat point to bbox
1887 bbox
.xmin
= bbox
.xmax
= vXi
;
1888 bbox
.ymin
= bbox
.ymax
= vYi
;
1890 simd16scalar vHalfWidth
= _simd16_mul_ps(vPointSize
, _simd16_set1_ps(0.5f
));
1891 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1893 bbox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1894 bbox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
1895 bbox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1896 bbox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
1898 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1899 // Gather the AOS effective scissor rects based on the per-prim VP index.
1900 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1901 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1902 if (state
.gsState
.emitsViewportArrayIndex
)
1904 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1905 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1907 else // broadcast fast path for non-VPAI case.
1909 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1910 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1911 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1912 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1915 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
1916 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
1917 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
1918 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
1920 // Cull bloated points completely outside scissor
1921 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1922 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1923 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1924 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
1925 primMask
= primMask
& ~maskOutsideScissor
;
1927 // Convert bbox to macrotile units.
1928 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1929 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1930 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1931 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1933 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
1935 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
1936 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
1937 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
1938 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
1940 // store render target array index
1941 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1942 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1944 simd16vector vRtai
[2];
1945 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
1946 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0].x
);
1947 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1951 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1954 OSALIGNSIMD16(float) aPointSize
[KNOB_SIMD16_WIDTH
];
1955 _simd16_store_ps(reinterpret_cast<float *>(aPointSize
), vPointSize
);
1957 uint32_t *pPrimID
= (uint32_t *)&primID
;
1959 OSALIGNSIMD16(float) aPrimVertsX
[KNOB_SIMD16_WIDTH
];
1960 OSALIGNSIMD16(float) aPrimVertsY
[KNOB_SIMD16_WIDTH
];
1961 OSALIGNSIMD16(float) aPrimVertsZ
[KNOB_SIMD16_WIDTH
];
1963 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX
), primVerts
.x
);
1964 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY
), primVerts
.y
);
1965 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ
), primVerts
.z
);
1967 // scan remaining valid prims and bin each separately
1968 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1970 while (_BitScanForward(&primIndex
, primMask
))
1972 uint32_t linkageCount
= backendState
.numAttributes
;
1973 uint32_t numScalarAttribs
= linkageCount
* 4;
1978 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1980 desc
.triFlags
.frontFacing
= 1;
1981 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1982 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1983 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1984 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1986 work
.pfnWork
= RasterizeTriPoint
;
1988 auto pArena
= pDC
->pArena
;
1989 SWR_ASSERT(pArena
!= nullptr);
1991 // store active attribs
1992 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1993 desc
.numAttribs
= linkageCount
;
1994 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1996 // store point vertex data
1997 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1998 desc
.pTriBuffer
= pTriBuffer
;
1999 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
2000 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
2001 *pTriBuffer
= aPrimVertsZ
[primIndex
];
2003 // store user clip distances
2004 if (rastState
.clipDistanceMask
)
2006 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2007 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
2010 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
2011 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
2012 desc
.pUserClipBuffer
[3 * i
+ 0] = 0.0f
;
2013 desc
.pUserClipBuffer
[3 * i
+ 1] = 0.0f
;
2014 desc
.pUserClipBuffer
[3 * i
+ 2] = dists
[i
];
2018 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2019 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2021 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2023 #if KNOB_ENABLE_TOSS_POINTS
2024 if (!KNOB_TOSS_SETUP_TRIS
)
2027 pTileMgr
->enqueue(x
, y
, &work
);
2032 primMask
&= ~(1 << primIndex
);
2036 AR_END(FEBinPoints
, 1);
2039 void SIMDAPI
BinPoints_simd16(
2043 simd16vector prim
[3],
2045 simd16scalari primID
,
2046 simd16scalari viewportIdx
)
2048 simd16vector
& primVerts
= prim
[0];
2050 const API_STATE
& state
= GetApiState(pDC
);
2051 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2052 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2054 if (!feState
.vpTransformDisable
)
2056 // perspective divide
2057 simd16scalar vRecipW0
= _simd16_div_ps(_simd16_set1_ps(1.0f
), primVerts
.w
);
2059 primVerts
.x
= _simd16_mul_ps(primVerts
.x
, vRecipW0
);
2060 primVerts
.y
= _simd16_mul_ps(primVerts
.y
, vRecipW0
);
2061 primVerts
.z
= _simd16_mul_ps(primVerts
.z
, vRecipW0
);
2063 // viewport transform to screen coords
2064 if (state
.gsState
.emitsViewportArrayIndex
)
2066 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
2070 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
2074 const simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2076 primVerts
.x
= _simd16_add_ps(primVerts
.x
, offset
);
2077 primVerts
.y
= _simd16_add_ps(primVerts
.y
, offset
);
2079 BinPostSetupPoints_simd16(
2090 //////////////////////////////////////////////////////////////////////////
2091 /// @brief Bin SIMD lines to the backend.
2092 /// @param pDC - pointer to draw context.
2093 /// @param pa - The primitive assembly object.
2094 /// @param workerId - thread's worker id. Even thread has a unique id.
2095 /// @param tri - Contains line position data for SIMDs worth of points.
2096 /// @param primID - Primitive ID for each line.
2097 /// @param viewportIdx - Viewport Array Index for each line.
2098 void BinPostSetupLines(
2103 simdscalar recipW
[],
2106 simdscalari viewportIdx
)
2108 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2110 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2112 const API_STATE
& state
= GetApiState(pDC
);
2113 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2114 const SWR_GS_STATE
& gsState
= state
.gsState
;
2116 // Select attribute processor
2117 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2118 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2120 simdscalar
& vRecipW0
= recipW
[0];
2121 simdscalar
& vRecipW1
= recipW
[1];
2123 // convert to fixed point
2124 simdscalari vXi
[2], vYi
[2];
2125 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2126 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2127 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2128 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2130 // compute x-major vs y-major mask
2131 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2132 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2133 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2134 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2136 // cull zero-length lines
2137 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2138 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2140 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2142 uint32_t *pPrimID
= (uint32_t *)&primID
;
2143 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2145 simdscalar vUnused
= _simd_setzero_ps();
2147 // Calc bounding box of lines
2149 bbox
.xmin
= _simd_min_epi32(vXi
[0], vXi
[1]);
2150 bbox
.xmax
= _simd_max_epi32(vXi
[0], vXi
[1]);
2151 bbox
.ymin
= _simd_min_epi32(vYi
[0], vYi
[1]);
2152 bbox
.ymax
= _simd_max_epi32(vYi
[0], vYi
[1]);
2154 // bloat bbox by line width along minor axis
2155 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2156 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2158 bloatBox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2159 bloatBox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
2160 bloatBox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2161 bloatBox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
2163 bbox
.xmin
= _simd_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2164 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2165 bbox
.ymin
= _simd_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2166 bbox
.ymax
= _simd_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2168 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2169 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2170 if (state
.gsState
.emitsViewportArrayIndex
)
2172 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2173 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2175 else // broadcast fast path for non-VPAI case.
2177 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2178 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2179 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2180 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2183 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2184 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2185 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2186 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2188 // Cull prims completely outside scissor
2190 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2191 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2192 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2193 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2194 primMask
= primMask
& ~maskOutsideScissor
;
2202 // Convert triangle bbox to macrotile units.
2203 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2204 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2205 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2206 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2208 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2209 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2210 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2211 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2212 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2214 // transpose verts needed for backend
2215 /// @todo modify BE to take non-transformed verts
2216 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2217 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2218 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2219 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2220 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2222 // store render target array index
2223 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2224 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2226 simdvector vRtai
[2];
2227 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2228 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2229 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2233 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2236 // scan remaining valid prims and bin each separately
2238 while (_BitScanForward(&primIndex
, primMask
))
2240 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2241 uint32_t numScalarAttribs
= linkageCount
* 4;
2246 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2248 desc
.triFlags
.frontFacing
= 1;
2249 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2250 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2251 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2252 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2254 work
.pfnWork
= RasterizeLine
;
2256 auto pArena
= pDC
->pArena
;
2257 SWR_ASSERT(pArena
!= nullptr);
2259 // store active attribs
2260 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2261 desc
.numAttribs
= linkageCount
;
2262 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2264 // store line vertex data
2265 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2266 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2267 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2268 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2269 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2271 // store user clip distances
2272 if (rastState
.clipDistanceMask
)
2274 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2275 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2276 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2279 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2280 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2282 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2284 #if KNOB_ENABLE_TOSS_POINTS
2285 if (!KNOB_TOSS_SETUP_TRIS
)
2288 pTileMgr
->enqueue(x
, y
, &work
);
2293 primMask
&= ~(1 << primIndex
);
2298 AR_END(FEBinLines
, 1);
2301 #if USE_SIMD16_FRONTEND
2302 void BinPostSetupLines_simd16(
2306 simd16vector prim
[],
2307 simd16scalar recipW
[],
2309 simd16scalari primID
,
2310 simd16scalari viewportIdx
)
2312 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2314 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2316 const API_STATE
& state
= GetApiState(pDC
);
2317 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2318 const SWR_GS_STATE
& gsState
= state
.gsState
;
2320 // Select attribute processor
2321 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2322 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2324 simd16scalar
& vRecipW0
= recipW
[0];
2325 simd16scalar
& vRecipW1
= recipW
[1];
2327 // convert to fixed point
2328 simd16scalari vXi
[2], vYi
[2];
2330 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2331 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2332 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2333 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2335 // compute x-major vs y-major mask
2336 simd16scalari xLength
= _simd16_abs_epi32(_simd16_sub_epi32(vXi
[0], vXi
[1]));
2337 simd16scalari yLength
= _simd16_abs_epi32(_simd16_sub_epi32(vYi
[0], vYi
[1]));
2338 simd16scalar vYmajorMask
= _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength
, xLength
));
2339 uint32_t yMajorMask
= _simd16_movemask_ps(vYmajorMask
);
2341 // cull zero-length lines
2342 simd16scalari vZeroLengthMask
= _simd16_cmpeq_epi32(xLength
, _simd16_setzero_si());
2343 vZeroLengthMask
= _simd16_and_si(vZeroLengthMask
, _simd16_cmpeq_epi32(yLength
, _simd16_setzero_si()));
2345 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask
));
2347 uint32_t *pPrimID
= (uint32_t *)&primID
;
2348 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2350 // Calc bounding box of lines
2352 bbox
.xmin
= _simd16_min_epi32(vXi
[0], vXi
[1]);
2353 bbox
.xmax
= _simd16_max_epi32(vXi
[0], vXi
[1]);
2354 bbox
.ymin
= _simd16_min_epi32(vYi
[0], vYi
[1]);
2355 bbox
.ymax
= _simd16_max_epi32(vYi
[0], vYi
[1]);
2357 // bloat bbox by line width along minor axis
2358 simd16scalar vHalfWidth
= _simd16_set1_ps(rastState
.lineWidth
/ 2.0f
);
2359 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2361 simd16BBox bloatBox
;
2363 bloatBox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2364 bloatBox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
2365 bloatBox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2366 bloatBox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
2368 bbox
.xmin
= _simd16_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2369 bbox
.xmax
= _simd16_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2370 bbox
.ymin
= _simd16_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2371 bbox
.ymax
= _simd16_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2373 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2374 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2376 if (state
.gsState
.emitsViewportArrayIndex
)
2378 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2379 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2381 else // broadcast fast path for non-VPAI case.
2383 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2384 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2385 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2386 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2389 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
2390 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
2391 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
2392 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
2394 // Cull prims completely outside scissor
2396 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2397 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2398 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2399 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
2400 primMask
= primMask
& ~maskOutsideScissor
;
2403 const simdscalar unused
= _simd_setzero_ps();
2410 // Convert triangle bbox to macrotile units.
2411 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2412 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2413 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2414 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2416 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
2418 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
2419 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
2420 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
2421 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
2423 // transpose verts needed for backend
2424 /// @todo modify BE to take non-transformed verts
2425 __m128 vHorizX
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2426 __m128 vHorizY
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2427 __m128 vHorizZ
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2428 __m128 vHorizW
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2430 vTranspose3x8(vHorizX
[0], _simd16_extract_ps(prim
[0].x
, 0), _simd16_extract_ps(prim
[1].x
, 0), unused
);
2431 vTranspose3x8(vHorizY
[0], _simd16_extract_ps(prim
[0].y
, 0), _simd16_extract_ps(prim
[1].y
, 0), unused
);
2432 vTranspose3x8(vHorizZ
[0], _simd16_extract_ps(prim
[0].z
, 0), _simd16_extract_ps(prim
[1].z
, 0), unused
);
2433 vTranspose3x8(vHorizW
[0], _simd16_extract_ps(vRecipW0
, 0), _simd16_extract_ps(vRecipW1
, 0), unused
);
2435 vTranspose3x8(vHorizX
[1], _simd16_extract_ps(prim
[0].x
, 1), _simd16_extract_ps(prim
[1].x
, 1), unused
);
2436 vTranspose3x8(vHorizY
[1], _simd16_extract_ps(prim
[0].y
, 1), _simd16_extract_ps(prim
[1].y
, 1), unused
);
2437 vTranspose3x8(vHorizZ
[1], _simd16_extract_ps(prim
[0].z
, 1), _simd16_extract_ps(prim
[1].z
, 1), unused
);
2438 vTranspose3x8(vHorizW
[1], _simd16_extract_ps(vRecipW0
, 1), _simd16_extract_ps(vRecipW1
, 1), unused
);
2440 // store render target array index
2441 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
2442 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2444 simd16vector vRtai
[2];
2445 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
2446 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0].x
);
2447 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
2451 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
2454 // scan remaining valid prims and bin each separately
2456 while (_BitScanForward(&primIndex
, primMask
))
2458 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2459 uint32_t numScalarAttribs
= linkageCount
* 4;
2464 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2466 desc
.triFlags
.frontFacing
= 1;
2467 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2468 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2469 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2470 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2472 work
.pfnWork
= RasterizeLine
;
2474 auto pArena
= pDC
->pArena
;
2475 SWR_ASSERT(pArena
!= nullptr);
2477 // store active attribs
2478 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2479 desc
.numAttribs
= linkageCount
;
2480 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2482 // store line vertex data
2483 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2486 const uint32_t i
= primIndex
>> 3; // triIndex / KNOB_SIMD_WIDTH
2487 const uint32_t j
= primIndex
& 7; // triIndex % KNOB_SIMD_WIDTH
2489 _mm_store_ps(&desc
.pTriBuffer
[ 0], vHorizX
[i
][j
]);
2490 _mm_store_ps(&desc
.pTriBuffer
[ 4], vHorizY
[i
][j
]);
2491 _mm_store_ps(&desc
.pTriBuffer
[ 8], vHorizZ
[i
][j
]);
2492 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[i
][j
]);
2495 // store user clip distances
2496 if (rastState
.clipDistanceMask
)
2498 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2499 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2500 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2503 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2504 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2506 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2508 #if KNOB_ENABLE_TOSS_POINTS
2509 if (!KNOB_TOSS_SETUP_TRIS
)
2512 pTileMgr
->enqueue(x
, y
, &work
);
2517 primMask
&= ~(1 << primIndex
);
2522 AR_END(FEBinLines
, 1);
2526 //////////////////////////////////////////////////////////////////////////
2527 /// @brief Bin SIMD lines to the backend.
2528 /// @param pDC - pointer to draw context.
2529 /// @param pa - The primitive assembly object.
2530 /// @param workerId - thread's worker id. Even thread has a unique id.
2531 /// @param tri - Contains line position data for SIMDs worth of points.
2532 /// @param primID - Primitive ID for each line.
2533 /// @param viewportIdx - Viewport Array Index for each line.
2541 simdscalari viewportIdx
)
2543 const API_STATE
& state
= GetApiState(pDC
);
2544 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2545 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2547 simdscalar vRecipW
[2] = { _simd_set1_ps(1.0f
), _simd_set1_ps(1.0f
) };
2549 if (!feState
.vpTransformDisable
)
2551 // perspective divide
2552 vRecipW
[0] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2553 vRecipW
[1] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
2555 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2556 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2558 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2559 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2561 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2562 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2564 // viewport transform to screen coords
2565 if (state
.gsState
.emitsViewportArrayIndex
)
2567 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2571 viewportTransform
<2>(prim
, state
.vpMatrices
);
2575 // adjust for pixel center location
2576 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2577 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2578 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2580 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2581 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2594 #if USE_SIMD16_FRONTEND
2595 void SIMDAPI
BinLines_simd16(
2599 simd16vector prim
[3],
2601 simd16scalari primID
,
2602 simd16scalari viewportIdx
)
2604 const API_STATE
& state
= GetApiState(pDC
);
2605 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2606 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2608 simd16scalar vRecipW
[2] = { _simd16_set1_ps(1.0f
), _simd16_set1_ps(1.0f
) };
2610 if (!feState
.vpTransformDisable
)
2612 // perspective divide
2613 vRecipW
[0] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[0].w
);
2614 vRecipW
[1] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[1].w
);
2616 prim
[0].v
[0] = _simd16_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2617 prim
[1].v
[0] = _simd16_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2619 prim
[0].v
[1] = _simd16_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2620 prim
[1].v
[1] = _simd16_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2622 prim
[0].v
[2] = _simd16_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2623 prim
[1].v
[2] = _simd16_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2625 // viewport transform to screen coords
2626 if (state
.gsState
.emitsViewportArrayIndex
)
2628 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2632 viewportTransform
<2>(prim
, state
.vpMatrices
);
2636 // adjust for pixel center location
2637 simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2639 prim
[0].x
= _simd16_add_ps(prim
[0].x
, offset
);
2640 prim
[0].y
= _simd16_add_ps(prim
[0].y
, offset
);
2642 prim
[1].x
= _simd16_add_ps(prim
[1].x
, offset
);
2643 prim
[1].y
= _simd16_add_ps(prim
[1].y
, offset
);
2645 BinPostSetupLines_simd16(