1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
31 #include "conservativeRast.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
38 void BinPostSetupLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], simdscalar vRecipW
[2], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
39 void BinPostSetupPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
41 #if USE_SIMD16_FRONTEND
42 void BinPostSetupLines_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[3], simd16scalar vRecipW
[2], uint32_t primMask
, simd16scalari primID
, simd16scalari viewportIdx
);
43 void BinPostSetupPoints_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari primID
, simd16scalari viewportIdx
);
46 //////////////////////////////////////////////////////////////////////////
47 /// @brief Offsets added to post-viewport vertex positions based on
49 static const simdscalar g_pixelOffsets
[SWR_PIXEL_LOCATION_UL
+ 1] =
51 _simd_set1_ps(0.0f
), // SWR_PIXEL_LOCATION_CENTER
52 _simd_set1_ps(0.5f
), // SWR_PIXEL_LOCATION_UL
#if USE_SIMD16_FRONTEND
// simd16 variant of the pixel-center offset table, indexed by SWR_PIXEL_LOCATION_*.
static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd16_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd16_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
#endif
63 //////////////////////////////////////////////////////////////////////////
64 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
65 /// Point precision from FP32.
66 template <typename PT
= FixedPointTraits
<Fixed_16_8
>>
67 INLINE simdscalari
fpToFixedPointVertical(const simdscalar vIn
)
69 simdscalar vFixed
= _simd_mul_ps(vIn
, _simd_set1_ps(PT::ScaleT::value
));
70 return _simd_cvtps_epi32(vFixed
);
#if USE_SIMD16_FRONTEND
/// @brief simd16 overload: scale FP32 values into fixed point (16.8 by
///        default) and round-convert to integer.
template <typename PT = FixedPointTraits<Fixed_16_8>>
INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
{
    simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
    return _simd16_cvtps_epi32(vFixed);
}
#endif
82 //////////////////////////////////////////////////////////////////////////
83 /// @brief Helper function to set the X,Y coords of a triangle to the
84 /// requested Fixed Point precision from FP32.
85 /// @param tri: simdvector[3] of FP triangle verts
86 /// @param vXi: fixed point X coords of tri verts
87 /// @param vYi: fixed point Y coords of tri verts
88 INLINE
static void FPToFixedPoint(const simdvector
* const tri
, simdscalari(&vXi
)[3], simdscalari(&vYi
)[3])
90 vXi
[0] = fpToFixedPointVertical(tri
[0].x
);
91 vYi
[0] = fpToFixedPointVertical(tri
[0].y
);
92 vXi
[1] = fpToFixedPointVertical(tri
[1].x
);
93 vYi
[1] = fpToFixedPointVertical(tri
[1].y
);
94 vXi
[2] = fpToFixedPointVertical(tri
[2].x
);
95 vYi
[2] = fpToFixedPointVertical(tri
[2].y
);
#if USE_SIMD16_FRONTEND
/// @brief simd16 overload: convert the X,Y coords of all 3 triangle verts
///        from FP32 to the default fixed-point precision.
INLINE
static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3])
{
    vXi[0] = fpToFixedPointVertical(tri[0].x);
    vYi[0] = fpToFixedPointVertical(tri[0].y);
    vXi[1] = fpToFixedPointVertical(tri[1].x);
    vYi[1] = fpToFixedPointVertical(tri[1].y);
    vXi[2] = fpToFixedPointVertical(tri[2].x);
    vYi[2] = fpToFixedPointVertical(tri[2].y);
}
#endif
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Calculate bounding box for current triangle
112 /// @tparam CT: ConservativeRastFETraits type
113 /// @param vX: fixed point X position for triangle verts
114 /// @param vY: fixed point Y position for triangle verts
115 /// @param bbox: fixed point bbox
116 /// *Note*: expects vX, vY to be in the correct precision for the type
117 /// of rasterization. This avoids unnecessary FP->fixed conversions.
118 template <typename CT
>
119 INLINE
void calcBoundingBoxIntVertical(const simdvector
* const tri
, simdscalari(&vX
)[3], simdscalari(&vY
)[3], simdBBox
&bbox
)
121 simdscalari vMinX
= vX
[0];
122 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
123 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
125 simdscalari vMaxX
= vX
[0];
126 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
127 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
129 simdscalari vMinY
= vY
[0];
130 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
131 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
133 simdscalari vMaxY
= vY
[0];
134 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
135 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
#if USE_SIMD16_FRONTEND
/// @brief simd16 overload of calcBoundingBoxIntVertical (non-conservative).
template <typename CT>
INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
{
    // per-lane min/max reduction over the 3 verts
    simd16scalari vMinX = vX[0];

    vMinX = _simd16_min_epi32(vMinX, vX[1]);
    vMinX = _simd16_min_epi32(vMinX, vX[2]);

    simd16scalari vMaxX = vX[0];

    vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
    vMaxX = _simd16_max_epi32(vMaxX, vX[2]);

    simd16scalari vMinY = vY[0];

    vMinY = _simd16_min_epi32(vMinY, vY[1]);
    vMinY = _simd16_min_epi32(vMinY, vY[2]);

    simd16scalari vMaxY = vY[0];

    vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
    vMaxY = _simd16_max_epi32(vMaxY, vY[2]);

    // NOTE(review): the bbox stores were missing from this garbled excerpt;
    // restored to mirror the FEConservativeRastT specialization, minus the
    // conservative-rast offset. Verify against the canonical source.
    bbox.xmin = vMinX;
    bbox.xmax = vMaxX;
    bbox.ymin = vMinY;
    bbox.ymax = vMaxY;
}
#endif
174 //////////////////////////////////////////////////////////////////////////
175 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
176 /// Offsets BBox for conservative rast
178 INLINE
void calcBoundingBoxIntVertical
<FEConservativeRastT
>(const simdvector
* const tri
, simdscalari(&vX
)[3], simdscalari(&vY
)[3], simdBBox
&bbox
)
180 // FE conservative rast traits
181 typedef FEConservativeRastT CT
;
183 simdscalari vMinX
= vX
[0];
184 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
185 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
187 simdscalari vMaxX
= vX
[0];
188 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
189 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
191 simdscalari vMinY
= vY
[0];
192 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
193 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
195 simdscalari vMaxY
= vY
[0];
196 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
197 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
199 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
200 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
201 bbox
.xmin
= _simd_sub_epi32(vMinX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
202 bbox
.xmax
= _simd_add_epi32(vMaxX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
203 bbox
.ymin
= _simd_sub_epi32(vMinY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
204 bbox
.ymax
= _simd_add_epi32(vMaxY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
#if USE_SIMD16_FRONTEND
/// @brief simd16 FEConservativeRastT specialization: expands the bbox by the
///        conservative-rast offset before the 16.8 snap.
template <>
INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
{
    // FE conservative rast traits
    typedef FEConservativeRastT CT;

    // per-lane min/max reduction over the 3 verts
    simd16scalari vMinX = vX[0];

    vMinX = _simd16_min_epi32(vMinX, vX[1]);
    vMinX = _simd16_min_epi32(vMinX, vX[2]);

    simd16scalari vMaxX = vX[0];

    vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
    vMaxX = _simd16_max_epi32(vMaxX, vX[2]);

    simd16scalari vMinY = vY[0];

    vMinY = _simd16_min_epi32(vMinY, vY[1]);
    vMinY = _simd16_min_epi32(vMinY, vY[2]);

    simd16scalari vMaxY = vY[0];

    vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
    vMaxY = _simd16_max_epi32(vMaxY, vY[2]);

    /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
    /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
    bbox.xmin = _simd16_sub_epi32(vMinX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
    bbox.xmax = _simd16_add_epi32(vMaxX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
    bbox.ymin = _simd16_sub_epi32(vMinY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
    bbox.ymax = _simd16_add_epi32(vMaxY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
}
#endif
239 //////////////////////////////////////////////////////////////////////////
240 /// @brief Processes attributes for the backend based on linkage mask and
241 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
242 /// @param pDC - Draw context
243 /// @param pa - Primitive Assembly state
244 /// @param linkageMask - Specifies which VS outputs are routed to PS.
245 /// @param pLinkageMap - maps VS attribute slot to PS slot
246 /// @param triIndex - Triangle to process attributes for
247 /// @param pBuffer - Output result
// NOTE(review): this excerpt appears garbled -- the function's parameter
// list, several braces, the topology switch header, some case labels and a
// few declarations (e.g. 'inputSlot', 'vid', 'comp', the pBuffer advance)
// are not visible. Code tokens below are kept exactly as found; only
// comments were added. TODO: restore from the canonical source before use.
template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
INLINE void ProcessAttributes(
    // Only point (1), line (2) and triangle (3) vert counts are supported.
    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;

    // Values used by the constant-source override paths below,
    // indexed by SWR_CONSTANT_SOURCE_CONST_* then by component.
    static const float constTable[3][4] = {
        { 0.0f, 0.0f, 0.0f, 0.0f },
        { 0.0f, 0.0f, 0.0f, 1.0f },
        { 1.0f, 1.0f, 1.0f, 1.0f }

    // One iteration per backend attribute: SOA->AOS pack into pBuffer.
    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
        if (IsSwizzledT::value)
            // Remap the source VS attribute through the swizzle map.
            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
            inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
            // (else path: identity mapping when swizzling is disabled)
            inputSlot = VERTEX_ATTRIB_START_SLOT + i;

        __m128 attrib[3]; // triangle attribs (always 4 wide)
        float* pAttribStart = pBuffer;

        if (HasConstantInterpT::value || IsDegenerate::value)
            if (_bittest(&constantInterpMask, i))
                // Constant interpolation: replicate the provoking vertex's
                // attribute to every output vert. The tables below map
                // (triIndex parity, provoking-vertex mode) to the assembled
                // tri index and the vert id to broadcast from.
                uint32_t adjustedTriIndex;
                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
                static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
                static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
                static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
                static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
                // (quad-list case -- presumably under a 'switch (topo)' whose header is not visible here)
                    adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
                    vid = quadProvokingVertex[triIndex & 1][provokingVertex];
                // (quad-strip case)
                    adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
                    vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
                case TOP_TRIANGLE_STRIP:
                    adjustedTriIndex = triIndex;
                    // Odd strip tris have reversed winding; remap the provoking vertex.
                        ? tristripProvokingVertex[provokingVertex]
                // (default case: all other topologies use the provoking vertex directly)
                    adjustedTriIndex = triIndex;
                    vid = provokingVertex;

                pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);

                // Broadcast the provoking vertex's attribute to each output vert.
                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                    _mm_store_ps(pBuffer, attrib[vid]);

                // (else of _bittest: attribute is not constant-interpolated)
                pa.AssembleSingle(inputSlot, triIndex, attrib);

                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                    _mm_store_ps(pBuffer, attrib[i]);

            // (else: no constant interpolation possible -- straight SOA->AOS copy)
            pa.AssembleSingle(inputSlot, triIndex, attrib);

            for (uint32_t i = 0; i < NumVertsT::value; ++i)
                _mm_store_ps(pBuffer, attrib[i]);

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t v = NumVertsT::value; v < 3; ++v)
            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);

        // check for constant source overrides
        if (IsSwizzledT::value)
            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
            // Visit each overridden component; 'comp' receives the bit index.
            while (_BitScanForward(&comp, mask))
                mask &= ~(1 << comp);

                float constantValue = 0.0f;
                switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
                case SWR_CONSTANT_SOURCE_CONST_0000:
                case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
                case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
                    // Pull the override value from constTable.
                    constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
                case SWR_CONSTANT_SOURCE_PRIM_ID:
                    // Reinterpret the primitive id's bits as a float.
                    constantValue = *(float*)&primId;

                // apply constant value to all 3 vertices
                for (uint32_t v = 0; v < 3; ++v)
                    pAttribStart[comp + v * 4] = constantValue;
394 //////////////////////////////////////////////////////////////////////////
395 /// @brief Gather scissor rect data based on per-prim viewport indices.
396 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
403 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
404 template<size_t SimdWidth
>
405 struct GatherScissors
407 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
408 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
409 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
411 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
416 struct GatherScissors
<8>
418 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
419 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
420 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
422 scisXmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
423 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
424 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
425 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
426 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
427 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
428 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
429 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
430 scisYmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
431 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
432 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
433 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
434 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
435 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
436 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
437 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
438 scisXmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
439 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
440 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
441 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
442 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
443 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
444 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
445 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
446 scisYmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
447 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
448 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
449 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
450 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
451 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
452 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
453 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
457 #if USE_SIMD16_FRONTEND
458 template<size_t SimdWidth
>
459 struct GatherScissors_simd16
461 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
462 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
,
463 simd16scalari
&scisXmax
, simd16scalari
&scisYmax
)
465 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
470 struct GatherScissors_simd16
<16>
472 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
473 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
,
474 simd16scalari
&scisXmax
, simd16scalari
&scisYmax
)
476 scisXmin
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
477 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
478 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
479 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
480 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
481 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
482 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
483 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
,
484 pScissorsInFixedPoint
[pViewportIndex
[8]].xmin
,
485 pScissorsInFixedPoint
[pViewportIndex
[9]].xmin
,
486 pScissorsInFixedPoint
[pViewportIndex
[10]].xmin
,
487 pScissorsInFixedPoint
[pViewportIndex
[11]].xmin
,
488 pScissorsInFixedPoint
[pViewportIndex
[12]].xmin
,
489 pScissorsInFixedPoint
[pViewportIndex
[13]].xmin
,
490 pScissorsInFixedPoint
[pViewportIndex
[14]].xmin
,
491 pScissorsInFixedPoint
[pViewportIndex
[15]].xmin
);
493 scisYmin
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
494 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
495 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
496 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
497 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
498 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
499 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
500 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
,
501 pScissorsInFixedPoint
[pViewportIndex
[8]].ymin
,
502 pScissorsInFixedPoint
[pViewportIndex
[9]].ymin
,
503 pScissorsInFixedPoint
[pViewportIndex
[10]].ymin
,
504 pScissorsInFixedPoint
[pViewportIndex
[11]].ymin
,
505 pScissorsInFixedPoint
[pViewportIndex
[12]].ymin
,
506 pScissorsInFixedPoint
[pViewportIndex
[13]].ymin
,
507 pScissorsInFixedPoint
[pViewportIndex
[14]].ymin
,
508 pScissorsInFixedPoint
[pViewportIndex
[15]].ymin
);
510 scisXmax
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
511 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
512 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
513 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
514 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
515 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
516 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
517 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
,
518 pScissorsInFixedPoint
[pViewportIndex
[8]].xmax
,
519 pScissorsInFixedPoint
[pViewportIndex
[9]].xmax
,
520 pScissorsInFixedPoint
[pViewportIndex
[10]].xmax
,
521 pScissorsInFixedPoint
[pViewportIndex
[11]].xmax
,
522 pScissorsInFixedPoint
[pViewportIndex
[12]].xmax
,
523 pScissorsInFixedPoint
[pViewportIndex
[13]].xmax
,
524 pScissorsInFixedPoint
[pViewportIndex
[14]].xmax
,
525 pScissorsInFixedPoint
[pViewportIndex
[15]].xmax
);
527 scisYmax
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
528 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
529 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
530 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
531 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
532 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
533 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
534 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
,
535 pScissorsInFixedPoint
[pViewportIndex
[8]].ymax
,
536 pScissorsInFixedPoint
[pViewportIndex
[9]].ymax
,
537 pScissorsInFixedPoint
[pViewportIndex
[10]].ymax
,
538 pScissorsInFixedPoint
[pViewportIndex
[11]].ymax
,
539 pScissorsInFixedPoint
[pViewportIndex
[12]].ymax
,
540 pScissorsInFixedPoint
[pViewportIndex
[13]].ymax
,
541 pScissorsInFixedPoint
[pViewportIndex
[14]].ymax
,
542 pScissorsInFixedPoint
[pViewportIndex
[15]].ymax
);
547 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
549 struct ProcessAttributesChooser
551 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
553 template <typename
... ArgsB
>
554 static FuncType
GetFunc()
556 return ProcessAttributes
<ArgsB
...>;
560 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
562 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
565 //////////////////////////////////////////////////////////////////////////
566 /// @brief Processes enabled user clip distances. Loads the active clip
567 /// distances from the PA, sets up barycentric equations, and
568 /// stores the results to the output buffer
569 /// @param pa - Primitive Assembly state
570 /// @param primIndex - primitive index to process
571 /// @param clipDistMask - mask of enabled clip distances
/// @param pRecipW - array of 1/w per vertex
/// @param pUserClipBuffer - buffer to store results
573 template<uint32_t NumVerts
>
574 void ProcessUserClipDist(PA_STATE
& pa
, uint32_t primIndex
, uint8_t clipDistMask
, float *pRecipW
, float* pUserClipBuffer
)
577 while (_BitScanForward(&clipDist
, clipDistMask
))
579 clipDistMask
&= ~(1 << clipDist
);
580 uint32_t clipSlot
= clipDist
>> 2;
581 uint32_t clipComp
= clipDist
& 0x3;
582 uint32_t clipAttribSlot
= clipSlot
== 0 ?
583 VERTEX_CLIPCULL_DIST_LO_SLOT
: VERTEX_CLIPCULL_DIST_HI_SLOT
;
585 __m128 primClipDist
[3];
586 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
588 float vertClipDist
[NumVerts
];
589 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
591 OSALIGNSIMD(float) aVertClipDist
[4];
592 _mm_store_ps(aVertClipDist
, primClipDist
[e
]);
593 vertClipDist
[e
] = aVertClipDist
[clipComp
];
596 // setup plane equations for barycentric interpolation in the backend
597 float baryCoeff
[NumVerts
];
598 float last
= vertClipDist
[NumVerts
- 1] * pRecipW
[NumVerts
- 1];
599 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
601 baryCoeff
[e
] = vertClipDist
[e
] * pRecipW
[e
] - last
;
603 baryCoeff
[NumVerts
- 1] = last
;
605 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
607 *(pUserClipBuffer
++) = baryCoeff
[e
];
612 //////////////////////////////////////////////////////////////////////////
613 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
614 /// culling, viewport transform, etc.
615 /// @param pDC - pointer to draw context.
616 /// @param pa - The primitive assembly object.
617 /// @param workerId - thread's worker id. Even thread has a unique id.
618 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
619 /// @param primID - Primitive ID for each triangle.
620 /// @param viewportIdx - viewport array index for each triangle.
621 /// @tparam CT - ConservativeRastFETraits
622 template <typename CT
>
630 simdscalari viewportIdx
)
632 SWR_CONTEXT
*pContext
= pDC
->pContext
;
634 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
636 const API_STATE
& state
= GetApiState(pDC
);
637 const SWR_RASTSTATE
& rastState
= state
.rastState
;
638 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
639 const SWR_GS_STATE
& gsState
= state
.gsState
;
640 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
642 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
643 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
644 simdscalar vRecipW2
= _simd_set1_ps(1.0f
);
646 if (feState
.vpTransformDisable
)
648 // RHW is passed in directly when VP transform is disabled
649 vRecipW0
= tri
[0].v
[3];
650 vRecipW1
= tri
[1].v
[3];
651 vRecipW2
= tri
[2].v
[3];
655 // Perspective divide
656 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[0].w
);
657 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[1].w
);
658 vRecipW2
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[2].w
);
660 tri
[0].v
[0] = _simd_mul_ps(tri
[0].v
[0], vRecipW0
);
661 tri
[1].v
[0] = _simd_mul_ps(tri
[1].v
[0], vRecipW1
);
662 tri
[2].v
[0] = _simd_mul_ps(tri
[2].v
[0], vRecipW2
);
664 tri
[0].v
[1] = _simd_mul_ps(tri
[0].v
[1], vRecipW0
);
665 tri
[1].v
[1] = _simd_mul_ps(tri
[1].v
[1], vRecipW1
);
666 tri
[2].v
[1] = _simd_mul_ps(tri
[2].v
[1], vRecipW2
);
668 tri
[0].v
[2] = _simd_mul_ps(tri
[0].v
[2], vRecipW0
);
669 tri
[1].v
[2] = _simd_mul_ps(tri
[1].v
[2], vRecipW1
);
670 tri
[2].v
[2] = _simd_mul_ps(tri
[2].v
[2], vRecipW2
);
672 // Viewport transform to screen space coords
673 if (state
.gsState
.emitsViewportArrayIndex
)
675 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
679 viewportTransform
<3>(tri
, state
.vpMatrices
);
683 // Adjust for pixel center location
684 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
685 tri
[0].x
= _simd_add_ps(tri
[0].x
, offset
);
686 tri
[0].y
= _simd_add_ps(tri
[0].y
, offset
);
688 tri
[1].x
= _simd_add_ps(tri
[1].x
, offset
);
689 tri
[1].y
= _simd_add_ps(tri
[1].y
, offset
);
691 tri
[2].x
= _simd_add_ps(tri
[2].x
, offset
);
692 tri
[2].y
= _simd_add_ps(tri
[2].y
, offset
);
694 simdscalari vXi
[3], vYi
[3];
695 // Set vXi, vYi to required fixed point precision
696 FPToFixedPoint(tri
, vXi
, vYi
);
699 simdscalari vAi
[3], vBi
[3];
700 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
704 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
707 int maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[0], _simd_setzero_si())));
708 int maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[1], _simd_setzero_si())));
710 int cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
712 uint32_t origTriMask
= triMask
;
713 // don't cull degenerate triangles if we're conservatively rasterizing
714 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
716 triMask
&= ~cullZeroAreaMask
;
719 // determine front winding tris
722 // 0 area triangles are marked as backfacing regardless of winding order,
723 // which is required behavior for conservative rast and wireframe rendering
724 uint32_t frontWindingTris
;
725 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
727 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[0], _simd_setzero_si())));
728 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[1], _simd_setzero_si())));
732 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[0])));
733 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[1])));
735 frontWindingTris
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
739 switch ((SWR_CULLMODE
)rastState
.cullMode
)
741 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
742 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
743 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
744 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
745 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
746 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
749 triMask
&= ~cullTris
;
751 if (origTriMask
^ triMask
)
753 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
756 // Simple non-conformant wireframe mode, useful for debugging
757 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
759 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
761 simdscalar recipW
[2];
764 recipW
[0] = vRecipW0
;
765 recipW
[1] = vRecipW1
;
766 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
770 recipW
[0] = vRecipW1
;
771 recipW
[1] = vRecipW2
;
772 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
776 recipW
[0] = vRecipW2
;
777 recipW
[1] = vRecipW0
;
778 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
780 AR_END(FEBinTriangles
, 1);
782 } else if (rastState
.fillMode
== SWR_FILLMODE_POINT
)
786 BinPostSetupPoints(pDC
, pa
, workerId
, &tri
[0], triMask
, primID
, viewportIdx
);
787 BinPostSetupPoints(pDC
, pa
, workerId
, &tri
[1], triMask
, primID
, viewportIdx
);
788 BinPostSetupPoints(pDC
, pa
, workerId
, &tri
[2], triMask
, primID
, viewportIdx
);
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
793 // compute per tri backface
794 uint32_t frontFaceMask
= frontWindingTris
;
795 uint32_t *pPrimID
= (uint32_t *)&primID
;
796 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
799 PFN_WORK_FUNC pfnWork
;
800 if (CT::IsConservativeT::value
)
802 // determine which edges of the degenerate tri, if any, are valid to rasterize.
803 // used to call the appropriate templated rasterizer function
804 if (cullZeroAreaMask
> 0)
807 simdscalari x0x1Mask
= _simd_cmpeq_epi32(vXi
[0], vXi
[1]);
808 simdscalari y0y1Mask
= _simd_cmpeq_epi32(vYi
[0], vYi
[1]);
809 uint32_t e0Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask
, y0y1Mask
)));
812 simdscalari x1x2Mask
= _simd_cmpeq_epi32(vXi
[1], vXi
[2]);
813 simdscalari y1y2Mask
= _simd_cmpeq_epi32(vYi
[1], vYi
[2]);
814 uint32_t e1Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask
, y1y2Mask
)));
817 // if v0 == v1 & v1 == v2, v0 == v2
818 uint32_t e2Mask
= e0Mask
& e1Mask
;
819 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
821 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
822 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
823 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
824 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
825 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
826 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
827 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
829 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
833 edgeEnable
= 0x00FFFFFF;
838 // degenerate triangles won't be sent to rasterizer; just enable all edges
839 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
840 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
845 goto endBinTriangles
;
848 // Calc bounding box of triangles
850 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
852 // determine if triangle falls between pixel centers and discard
853 // only discard for non-MSAA case and when conservative rast is disabled
854 // (xmin + 127) & ~255
855 // (xmax + 128) & ~255
856 if((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
857 (!CT::IsConservativeT::value
))
859 origTriMask
= triMask
;
863 simdscalari xmin
= _simd_add_epi32(bbox
.xmin
, _simd_set1_epi32(127));
864 xmin
= _simd_and_si(xmin
, _simd_set1_epi32(~255));
865 simdscalari xmax
= _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(128));
866 xmax
= _simd_and_si(xmax
, _simd_set1_epi32(~255));
868 simdscalari vMaskH
= _simd_cmpeq_epi32(xmin
, xmax
);
870 simdscalari ymin
= _simd_add_epi32(bbox
.ymin
, _simd_set1_epi32(127));
871 ymin
= _simd_and_si(ymin
, _simd_set1_epi32(~255));
872 simdscalari ymax
= _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(128));
873 ymax
= _simd_and_si(ymax
, _simd_set1_epi32(~255));
875 simdscalari vMaskV
= _simd_cmpeq_epi32(ymin
, ymax
);
876 vMaskV
= _simd_or_si(vMaskH
, vMaskV
);
877 cullCenterMask
= _simd_movemask_ps(_simd_castsi_ps(vMaskV
));
880 triMask
&= ~cullCenterMask
;
882 if (origTriMask
^ triMask
)
884 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
888 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
889 // Gather the AOS effective scissor rects based on the per-prim VP index.
890 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
891 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
892 if (state
.gsState
.emitsViewportArrayIndex
)
894 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
895 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
897 else // broadcast fast path for non-VPAI case.
899 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
900 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
901 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
902 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
905 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
906 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
907 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
908 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
910 if (CT::IsConservativeT::value
)
912 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
913 // some area. Bump the xmax/ymax edges out
914 simdscalari topEqualsBottom
= _simd_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
915 bbox
.ymax
= _simd_blendv_epi32(bbox
.ymax
, _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(1)), topEqualsBottom
);
916 simdscalari leftEqualsRight
= _simd_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
917 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(1)), leftEqualsRight
);
920 // Cull tris completely outside scissor
922 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
923 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
924 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
925 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
926 triMask
= triMask
& ~maskOutsideScissor
;
931 goto endBinTriangles
;
934 // Convert triangle bbox to macrotile units.
935 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
936 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
937 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
938 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
940 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
941 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
942 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
943 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
944 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
946 // transpose verts needed for backend
947 /// @todo modify BE to take non-transformed verts
948 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
949 vTranspose3x8(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
950 vTranspose3x8(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
951 vTranspose3x8(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
952 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
954 // store render target array index
955 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
956 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
959 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
961 vRtaii
= _simd_castps_si(vRtai
[0].x
);
962 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
966 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
971 // scan remaining valid triangles and bin each separately
972 while (_BitScanForward(&triIndex
, triMask
))
974 uint32_t linkageCount
= state
.backendState
.numAttributes
;
975 uint32_t numScalarAttribs
= linkageCount
* 4;
981 if (CT::IsConservativeT::value
)
983 // only rasterize valid edges if we have a degenerate primitive
984 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
985 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
986 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
988 // Degenerate triangles are required to be constant interpolated
989 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
993 isDegenerate
= false;
994 work
.pfnWork
= pfnWork
;
997 // Select attribute processor
998 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
999 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
1001 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1003 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
1004 desc
.triFlags
.primID
= pPrimID
[triIndex
];
1005 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
1006 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
1008 auto pArena
= pDC
->pArena
;
1009 SWR_ASSERT(pArena
!= nullptr);
1011 // store active attribs
1012 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1013 desc
.pAttribs
= pAttribs
;
1014 desc
.numAttribs
= linkageCount
;
1015 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
1017 // store triangle vertex data
1018 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1020 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
1021 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
1022 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
1023 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
1025 // store user clip distances
1026 if (rastState
.clipDistanceMask
)
1028 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1029 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1030 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1033 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
1035 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
1037 #if KNOB_ENABLE_TOSS_POINTS
1038 if (!KNOB_TOSS_SETUP_TRIS
)
1041 pTileMgr
->enqueue(x
, y
, &work
);
1045 triMask
&= ~(1 << triIndex
);
1048 AR_END(FEBinTriangles
, 1);
1051 #if USE_SIMD16_FRONTEND
1052 template <typename CT
>
1053 void SIMDAPI
BinTriangles_simd16(
1057 simd16vector tri
[3],
1059 simd16scalari primID
,
1060 simd16scalari viewportIdx
)
1062 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1064 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
1066 const API_STATE
& state
= GetApiState(pDC
);
1067 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1068 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1069 const SWR_GS_STATE
& gsState
= state
.gsState
;
1071 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1073 simd16scalar vRecipW0
= _simd16_set1_ps(1.0f
);
1074 simd16scalar vRecipW1
= _simd16_set1_ps(1.0f
);
1075 simd16scalar vRecipW2
= _simd16_set1_ps(1.0f
);
1077 if (feState
.vpTransformDisable
)
1079 // RHW is passed in directly when VP transform is disabled
1080 vRecipW0
= tri
[0].v
[3];
1081 vRecipW1
= tri
[1].v
[3];
1082 vRecipW2
= tri
[2].v
[3];
1086 // Perspective divide
1087 vRecipW0
= _simd16_div_ps(_simd16_set1_ps(1.0f
), tri
[0].w
);
1088 vRecipW1
= _simd16_div_ps(_simd16_set1_ps(1.0f
), tri
[1].w
);
1089 vRecipW2
= _simd16_div_ps(_simd16_set1_ps(1.0f
), tri
[2].w
);
1091 tri
[0].v
[0] = _simd16_mul_ps(tri
[0].v
[0], vRecipW0
);
1092 tri
[1].v
[0] = _simd16_mul_ps(tri
[1].v
[0], vRecipW1
);
1093 tri
[2].v
[0] = _simd16_mul_ps(tri
[2].v
[0], vRecipW2
);
1095 tri
[0].v
[1] = _simd16_mul_ps(tri
[0].v
[1], vRecipW0
);
1096 tri
[1].v
[1] = _simd16_mul_ps(tri
[1].v
[1], vRecipW1
);
1097 tri
[2].v
[1] = _simd16_mul_ps(tri
[2].v
[1], vRecipW2
);
1099 tri
[0].v
[2] = _simd16_mul_ps(tri
[0].v
[2], vRecipW0
);
1100 tri
[1].v
[2] = _simd16_mul_ps(tri
[1].v
[2], vRecipW1
);
1101 tri
[2].v
[2] = _simd16_mul_ps(tri
[2].v
[2], vRecipW2
);
1103 // Viewport transform to screen space coords
1104 if (state
.gsState
.emitsViewportArrayIndex
)
1106 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
1110 viewportTransform
<3>(tri
, state
.vpMatrices
);
1114 // Adjust for pixel center location
1115 const simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
1117 tri
[0].x
= _simd16_add_ps(tri
[0].x
, offset
);
1118 tri
[0].y
= _simd16_add_ps(tri
[0].y
, offset
);
1120 tri
[1].x
= _simd16_add_ps(tri
[1].x
, offset
);
1121 tri
[1].y
= _simd16_add_ps(tri
[1].y
, offset
);
1123 tri
[2].x
= _simd16_add_ps(tri
[2].x
, offset
);
1124 tri
[2].y
= _simd16_add_ps(tri
[2].y
, offset
);
1126 simd16scalari vXi
[3], vYi
[3];
1128 // Set vXi, vYi to required fixed point precision
1129 FPToFixedPoint(tri
, vXi
, vYi
);
1132 simd16scalari vAi
[3], vBi
[3];
1133 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
1136 simd16scalari vDet
[2];
1137 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
1140 uint32_t maskLo
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet
[0], _simd16_setzero_si())));
1141 uint32_t maskHi
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet
[1], _simd16_setzero_si())));
1143 uint32_t cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD16_WIDTH
/ 2));
1145 // don't cull degenerate triangles if we're conservatively rasterizing
1146 uint32_t origTriMask
= triMask
;
1147 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
1149 triMask
&= ~cullZeroAreaMask
;
1152 // determine front winding tris
1155 // 0 area triangles are marked as backfacing regardless of winding order,
1156 // which is required behavior for conservative rast and wireframe rendering
1157 uint32_t frontWindingTris
;
1158 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
1160 maskLo
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet
[0], _simd16_setzero_si())));
1161 maskHi
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet
[1], _simd16_setzero_si())));
1165 maskLo
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet
[0])));
1166 maskHi
= _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet
[1])));
1168 frontWindingTris
= maskLo
| (maskHi
<< (KNOB_SIMD16_WIDTH
/ 2));
1172 switch ((SWR_CULLMODE
)rastState
.cullMode
)
1174 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
1175 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
1176 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
1177 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1178 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
1179 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
1182 triMask
&= ~cullTris
;
1184 if (origTriMask
^ triMask
)
1186 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
1189 // Simple non-conformant wireframe mode, useful for debugging
1190 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
1192 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1193 simd16vector line
[2];
1194 simd16scalar recipW
[2];
1197 recipW
[0] = vRecipW0
;
1198 recipW
[1] = vRecipW1
;
1199 BinPostSetupLines_simd16(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
1203 recipW
[0] = vRecipW1
;
1204 recipW
[1] = vRecipW2
;
1205 BinPostSetupLines_simd16(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
1209 recipW
[0] = vRecipW2
;
1210 recipW
[1] = vRecipW0
;
1211 BinPostSetupLines_simd16(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
1213 AR_END(FEBinTriangles
, 1);
1217 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
1218 // compute per tri backface
1219 uint32_t frontFaceMask
= frontWindingTris
;
1220 uint32_t *pPrimID
= (uint32_t *)&primID
;
1221 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1224 uint32_t edgeEnable
;
1225 PFN_WORK_FUNC pfnWork
;
1226 if (CT::IsConservativeT::value
)
1228 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1229 // used to call the appropriate templated rasterizer function
1230 if (cullZeroAreaMask
> 0)
1233 const simd16scalari x0x1Mask
= _simd16_cmpeq_epi32(vXi
[0], vXi
[1]);
1234 const simd16scalari y0y1Mask
= _simd16_cmpeq_epi32(vYi
[0], vYi
[1]);
1236 uint32_t e0Mask
= _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask
, y0y1Mask
)));
1239 const simd16scalari x1x2Mask
= _simd16_cmpeq_epi32(vXi
[1], vXi
[2]);
1240 const simd16scalari y1y2Mask
= _simd16_cmpeq_epi32(vYi
[1], vYi
[2]);
1242 uint32_t e1Mask
= _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask
, y1y2Mask
)));
1245 // if v0 == v1 & v1 == v2, v0 == v2
1246 uint32_t e2Mask
= e0Mask
& e1Mask
;
1247 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
1249 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1250 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1251 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
1253 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1254 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
1256 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1257 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
1259 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
1263 edgeEnable
= 0x00FFFFFF;
1268 // degenerate triangles won't be sent to rasterizer; just enable all edges
1269 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
1270 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
1275 goto endBinTriangles
;
1278 // Calc bounding box of triangles
1280 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
1282 // determine if triangle falls between pixel centers and discard
1283 // only discard for non-MSAA case and when conservative rast is disabled
1284 // (xmin + 127) & ~255
1285 // (xmax + 128) & ~255
1286 if ((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
1287 (!CT::IsConservativeT::value
))
1289 origTriMask
= triMask
;
1294 simd16scalari xmin
= _simd16_add_epi32(bbox
.xmin
, _simd16_set1_epi32(127));
1295 xmin
= _simd16_and_si(xmin
, _simd16_set1_epi32(~255));
1296 simd16scalari xmax
= _simd16_add_epi32(bbox
.xmax
, _simd16_set1_epi32(128));
1297 xmax
= _simd16_and_si(xmax
, _simd16_set1_epi32(~255));
1299 simd16scalari vMaskH
= _simd16_cmpeq_epi32(xmin
, xmax
);
1301 simd16scalari ymin
= _simd16_add_epi32(bbox
.ymin
, _simd16_set1_epi32(127));
1302 ymin
= _simd16_and_si(ymin
, _simd16_set1_epi32(~255));
1303 simd16scalari ymax
= _simd16_add_epi32(bbox
.ymax
, _simd16_set1_epi32(128));
1304 ymax
= _simd16_and_si(ymax
, _simd16_set1_epi32(~255));
1306 simd16scalari vMaskV
= _simd16_cmpeq_epi32(ymin
, ymax
);
1308 vMaskV
= _simd16_or_si(vMaskH
, vMaskV
);
1309 cullCenterMask
= _simd16_movemask_ps(_simd16_castsi_ps(vMaskV
));
1312 triMask
&= ~cullCenterMask
;
1314 if (origTriMask
^ triMask
)
1316 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
1320 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1321 // Gather the AOS effective scissor rects based on the per-prim VP index.
1322 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1323 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1325 if (state
.gsState
.emitsViewportArrayIndex
)
1327 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1328 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1330 else // broadcast fast path for non-VPAI case.
1332 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1333 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1334 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1335 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1338 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
1339 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
1340 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
1341 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
1343 if (CT::IsConservativeT::value
)
1345 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1346 // some area. Bump the xmax/ymax edges out
1347 simd16scalari topEqualsBottom
= _simd16_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
1348 bbox
.ymax
= _simd16_blendv_epi32(bbox
.ymax
, _simd16_add_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), topEqualsBottom
);
1349 simd16scalari leftEqualsRight
= _simd16_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
1350 bbox
.xmax
= _simd16_blendv_epi32(bbox
.xmax
, _simd16_add_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), leftEqualsRight
);
1353 // Cull tris completely outside scissor
1355 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1356 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1357 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1358 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
1359 triMask
= triMask
& ~maskOutsideScissor
;
1364 goto endBinTriangles
;
1367 // Convert triangle bbox to macrotile units.
1368 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1369 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1370 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1371 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1373 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
1375 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
1376 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
1377 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
1378 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
1380 // transpose verts needed for backend
1381 /// @todo modify BE to take non-transformed verts
1382 __m128 vHorizX
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1383 __m128 vHorizY
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1384 __m128 vHorizZ
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1385 __m128 vHorizW
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
1387 vTranspose3x8(vHorizX
[0], _simd16_extract_ps(tri
[0].x
, 0), _simd16_extract_ps(tri
[1].x
, 0), _simd16_extract_ps(tri
[2].x
, 0));
1388 vTranspose3x8(vHorizY
[0], _simd16_extract_ps(tri
[0].y
, 0), _simd16_extract_ps(tri
[1].y
, 0), _simd16_extract_ps(tri
[2].y
, 0));
1389 vTranspose3x8(vHorizZ
[0], _simd16_extract_ps(tri
[0].z
, 0), _simd16_extract_ps(tri
[1].z
, 0), _simd16_extract_ps(tri
[2].z
, 0));
1390 vTranspose3x8(vHorizW
[0], _simd16_extract_ps(vRecipW0
, 0), _simd16_extract_ps(vRecipW1
, 0), _simd16_extract_ps(vRecipW2
, 0));
1392 vTranspose3x8(vHorizX
[1], _simd16_extract_ps(tri
[0].x
, 1), _simd16_extract_ps(tri
[1].x
, 1), _simd16_extract_ps(tri
[2].x
, 1));
1393 vTranspose3x8(vHorizY
[1], _simd16_extract_ps(tri
[0].y
, 1), _simd16_extract_ps(tri
[1].y
, 1), _simd16_extract_ps(tri
[2].y
, 1));
1394 vTranspose3x8(vHorizZ
[1], _simd16_extract_ps(tri
[0].z
, 1), _simd16_extract_ps(tri
[1].z
, 1), _simd16_extract_ps(tri
[2].z
, 1));
1395 vTranspose3x8(vHorizW
[1], _simd16_extract_ps(vRecipW0
, 1), _simd16_extract_ps(vRecipW1
, 1), _simd16_extract_ps(vRecipW2
, 1));
1397 // store render target array index
1398 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1399 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1401 simd16vector vRtai
[3];
1402 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
1403 simd16scalari vRtaii
;
1404 vRtaii
= _simd16_castps_si(vRtai
[0].x
);
1405 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1409 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1415 // scan remaining valid triangles and bin each separately
1416 while (_BitScanForward(&triIndex
, triMask
))
1418 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1419 uint32_t numScalarAttribs
= linkageCount
* 4;
1425 if (CT::IsConservativeT::value
)
1427 // only rasterize valid edges if we have a degenerate primitive
1428 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
1429 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
1430 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
1432 // Degenerate triangles are required to be constant interpolated
1433 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
1437 isDegenerate
= false;
1438 work
.pfnWork
= pfnWork
;
1441 // Select attribute processor
1442 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
1443 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
1445 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1447 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
1448 desc
.triFlags
.primID
= pPrimID
[triIndex
];
1449 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
1450 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
1452 auto pArena
= pDC
->pArena
;
1453 SWR_ASSERT(pArena
!= nullptr);
1455 // store active attribs
1456 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1457 desc
.pAttribs
= pAttribs
;
1458 desc
.numAttribs
= linkageCount
;
1459 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
1461 // store triangle vertex data
1462 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1465 const uint32_t i
= triIndex
>> 3; // triIndex / KNOB_SIMD_WIDTH
1466 const uint32_t j
= triIndex
& 7; // triIndex % KNOB_SIMD_WIDTH
1468 _mm_store_ps(&desc
.pTriBuffer
[ 0], vHorizX
[i
][j
]);
1469 _mm_store_ps(&desc
.pTriBuffer
[ 4], vHorizY
[i
][j
]);
1470 _mm_store_ps(&desc
.pTriBuffer
[ 8], vHorizZ
[i
][j
]);
1471 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[i
][j
]);
1474 // store user clip distances
1475 if (rastState
.clipDistanceMask
)
1477 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1478 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1479 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1482 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
1484 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
1486 #if KNOB_ENABLE_TOSS_POINTS
1487 if (!KNOB_TOSS_SETUP_TRIS
)
1490 pTileMgr
->enqueue(x
, y
, &work
);
1495 triMask
&= ~(1 << triIndex
);
1498 AR_END(FEBinTriangles
, 1);
1502 struct FEBinTrianglesChooser
1504 typedef PFN_PROCESS_PRIMS FuncType
;
1506 template <typename
... ArgsB
>
1507 static FuncType
GetFunc()
1509 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
1513 // Selector for correct templated BinTrinagles function
1514 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
1516 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
1519 #if USE_SIMD16_FRONTEND
1520 struct FEBinTrianglesChooser_simd16
1522 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType
;
1524 template <typename
... ArgsB
>
1525 static FuncType
GetFunc()
1527 return BinTriangles_simd16
<ConservativeRastFETraits
<ArgsB
...>>;
1531 // Selector for correct templated BinTrinagles function
1532 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
)
1534 return TemplateArgUnroller
<FEBinTrianglesChooser_simd16
>::GetFunc(IsConservative
);
1539 void BinPostSetupPoints(
1546 simdscalari viewportIdx
)
1548 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1550 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1552 simdvector
& primVerts
= prim
[0];
1554 const API_STATE
& state
= GetApiState(pDC
);
1555 const SWR_GS_STATE
& gsState
= state
.gsState
;
1556 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1557 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1559 // Select attribute processor
1560 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1561 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1563 // convert to fixed point
1564 simdscalari vXi
, vYi
;
1565 vXi
= fpToFixedPointVertical(primVerts
.x
);
1566 vYi
= fpToFixedPointVertical(primVerts
.y
);
1568 if (CanUseSimplePoints(pDC
))
1570 // adjust for ymin-xmin rule
1571 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
1572 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
1574 // cull points off the ymin-xmin edge of the viewport
1575 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
1576 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
1578 // compute macro tile coordinates
1579 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1580 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1582 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
1583 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
1584 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
1586 // compute raster tile coordinates
1587 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1588 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1590 // compute raster tile relative x,y for coverage mask
1591 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1592 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1594 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1595 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1597 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
1598 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
1599 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
1600 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
1602 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
1603 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
1604 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
1605 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
1607 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
1608 _simd_store_ps((float*)aZ
, primVerts
.z
);
1610 // store render target array index
1611 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1612 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1615 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
1616 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
1617 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1621 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1624 uint32_t *pPrimID
= (uint32_t *)&primID
;
1625 DWORD primIndex
= 0;
1627 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1629 // scan remaining valid points and bin each separately
1630 while (_BitScanForward(&primIndex
, primMask
))
1632 uint32_t linkageCount
= backendState
.numAttributes
;
1633 uint32_t numScalarAttribs
= linkageCount
* 4;
1638 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1640 // points are always front facing
1641 desc
.triFlags
.frontFacing
= 1;
1642 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1643 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1644 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1646 work
.pfnWork
= RasterizeSimplePoint
;
1648 auto pArena
= pDC
->pArena
;
1649 SWR_ASSERT(pArena
!= nullptr);
1652 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1653 desc
.pAttribs
= pAttribs
;
1654 desc
.numAttribs
= linkageCount
;
1656 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1658 // store raster tile aligned x, y, perspective correct z
1659 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1660 desc
.pTriBuffer
= pTriBuffer
;
1661 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1662 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1663 *pTriBuffer
= aZ
[primIndex
];
1665 uint32_t tX
= aTileRelativeX
[primIndex
];
1666 uint32_t tY
= aTileRelativeY
[primIndex
];
1668 // pack the relative x,y into the coverageMask, the rasterizer will
1669 // generate the true coverage mask from it
1670 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1673 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1674 #if KNOB_ENABLE_TOSS_POINTS
1675 if (!KNOB_TOSS_SETUP_TRIS
)
1678 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1680 primMask
&= ~(1 << primIndex
);
1685 // non simple points need to be potentially binned to multiple macro tiles
1686 simdscalar vPointSize
;
1687 if (rastState
.pointParam
)
1690 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
1691 vPointSize
= size
[0].x
;
1695 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
1698 // bloat point to bbox
1700 bbox
.xmin
= bbox
.xmax
= vXi
;
1701 bbox
.ymin
= bbox
.ymax
= vYi
;
1703 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
1704 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1705 bbox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1706 bbox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
1707 bbox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1708 bbox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
1710 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1711 // Gather the AOS effective scissor rects based on the per-prim VP index.
1712 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1713 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1714 if (state
.gsState
.emitsViewportArrayIndex
)
1716 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1717 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1719 else // broadcast fast path for non-VPAI case.
1721 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1722 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1723 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1724 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1727 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
1728 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
1729 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
1730 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
1732 // Cull bloated points completely outside scissor
1733 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1734 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1735 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1736 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1737 primMask
= primMask
& ~maskOutsideScissor
;
1739 // Convert bbox to macrotile units.
1740 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1741 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1742 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1743 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1745 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1746 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
1747 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
1748 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
1749 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
1751 // store render target array index
1752 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1753 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1755 simdvector vRtai
[2];
1756 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
1757 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
1758 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1762 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1765 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
1766 _simd_store_ps((float*)aPointSize
, vPointSize
);
1768 uint32_t *pPrimID
= (uint32_t *)&primID
;
1770 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
1771 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
1772 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
1774 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
1775 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
1776 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
1778 // scan remaining valid prims and bin each separately
1779 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1781 while (_BitScanForward(&primIndex
, primMask
))
1783 uint32_t linkageCount
= backendState
.numAttributes
;
1784 uint32_t numScalarAttribs
= linkageCount
* 4;
1789 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1791 desc
.triFlags
.frontFacing
= 1;
1792 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1793 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1794 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1795 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1797 work
.pfnWork
= RasterizeTriPoint
;
1799 auto pArena
= pDC
->pArena
;
1800 SWR_ASSERT(pArena
!= nullptr);
1802 // store active attribs
1803 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1804 desc
.numAttribs
= linkageCount
;
1805 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1807 // store point vertex data
1808 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1809 desc
.pTriBuffer
= pTriBuffer
;
1810 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1811 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1812 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1814 // store user clip distances
1815 if (rastState
.clipDistanceMask
)
1817 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1818 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1821 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
1822 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1823 desc
.pUserClipBuffer
[3*i
+ 0] = 0.0f
;
1824 desc
.pUserClipBuffer
[3*i
+ 1] = 0.0f
;
1825 desc
.pUserClipBuffer
[3*i
+ 2] = dists
[i
];
1829 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1830 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1832 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1834 #if KNOB_ENABLE_TOSS_POINTS
1835 if (!KNOB_TOSS_SETUP_TRIS
)
1838 pTileMgr
->enqueue(x
, y
, &work
);
1843 primMask
&= ~(1 << primIndex
);
1847 AR_END(FEBinPoints
, 1);
1850 //////////////////////////////////////////////////////////////////////////
1851 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1852 /// @param pDC - pointer to draw context.
1853 /// @param pa - The primitive assembly object.
1854 /// @param workerId - thread's worker id. Even thread has a unique id.
1855 /// @param tri - Contains point position data for SIMDs worth of points.
1856 /// @param primID - Primitive ID for each point.
1864 simdscalari viewportIdx
)
1866 simdvector
& primVerts
= prim
[0];
1868 const API_STATE
& state
= GetApiState(pDC
);
1869 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1870 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1872 if (!feState
.vpTransformDisable
)
1874 // perspective divide
1875 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
1876 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
1877 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
1878 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
1880 // viewport transform to screen coords
1881 if (state
.gsState
.emitsViewportArrayIndex
)
1883 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
1887 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
1891 // adjust for pixel center location
1892 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1893 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
1894 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
1906 #if USE_SIMD16_FRONTEND
1907 void BinPostSetupPoints_simd16(
1911 simd16vector prim
[],
1913 simd16scalari primID
,
1914 simd16scalari viewportIdx
)
1916 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1918 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1920 simd16vector
& primVerts
= prim
[0];
1922 const API_STATE
& state
= GetApiState(pDC
);
1923 const SWR_GS_STATE
& gsState
= state
.gsState
;
1924 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1925 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1927 // Select attribute processor
1928 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1929 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1931 // convert to fixed point
1932 simd16scalari vXi
, vYi
;
1934 vXi
= fpToFixedPointVertical(primVerts
.x
);
1935 vYi
= fpToFixedPointVertical(primVerts
.y
);
1937 if (CanUseSimplePoints(pDC
))
1939 // adjust for ymin-xmin rule
1940 vXi
= _simd16_sub_epi32(vXi
, _simd16_set1_epi32(1));
1941 vYi
= _simd16_sub_epi32(vYi
, _simd16_set1_epi32(1));
1943 // cull points off the ymin-xmin edge of the viewport
1944 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi
));
1945 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi
));
1947 // compute macro tile coordinates
1948 simd16scalari macroX
= _simd16_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1949 simd16scalari macroY
= _simd16_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1951 OSALIGNSIMD16(uint32_t) aMacroX
[KNOB_SIMD16_WIDTH
], aMacroY
[KNOB_SIMD16_WIDTH
];
1953 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMacroX
), macroX
);
1954 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMacroY
), macroY
);
1956 // compute raster tile coordinates
1957 simd16scalari rasterX
= _simd16_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1958 simd16scalari rasterY
= _simd16_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1960 // compute raster tile relative x,y for coverage mask
1961 simd16scalari tileAlignedX
= _simd16_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1962 simd16scalari tileAlignedY
= _simd16_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1964 simd16scalari tileRelativeX
= _simd16_sub_epi32(_simd16_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1965 simd16scalari tileRelativeY
= _simd16_sub_epi32(_simd16_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1967 OSALIGNSIMD16(uint32_t) aTileRelativeX
[KNOB_SIMD16_WIDTH
];
1968 OSALIGNSIMD16(uint32_t) aTileRelativeY
[KNOB_SIMD16_WIDTH
];
1970 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileRelativeX
), tileRelativeX
);
1971 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileRelativeY
), tileRelativeY
);
1973 OSALIGNSIMD16(uint32_t) aTileAlignedX
[KNOB_SIMD16_WIDTH
];
1974 OSALIGNSIMD16(uint32_t) aTileAlignedY
[KNOB_SIMD16_WIDTH
];
1976 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileAlignedX
), tileAlignedX
);
1977 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileAlignedY
), tileAlignedY
);
1979 OSALIGNSIMD16(float) aZ
[KNOB_SIMD16_WIDTH
];
1980 _simd16_store_ps(reinterpret_cast<float *>(aZ
), primVerts
.z
);
1982 // store render target array index
1983 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1984 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1987 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, &vRtai
);
1988 simd16scalari vRtaii
= _simd16_castps_si(vRtai
.x
);
1989 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1993 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1996 uint32_t *pPrimID
= (uint32_t *)&primID
;
1997 DWORD primIndex
= 0;
1999 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
2001 // scan remaining valid triangles and bin each separately
2002 while (_BitScanForward(&primIndex
, primMask
))
2004 uint32_t linkageCount
= backendState
.numAttributes
;
2005 uint32_t numScalarAttribs
= linkageCount
* 4;
2010 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2012 // points are always front facing
2013 desc
.triFlags
.frontFacing
= 1;
2014 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2015 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2016 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2018 work
.pfnWork
= RasterizeSimplePoint
;
2020 auto pArena
= pDC
->pArena
;
2021 SWR_ASSERT(pArena
!= nullptr);
2024 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
2025 desc
.pAttribs
= pAttribs
;
2026 desc
.numAttribs
= linkageCount
;
2028 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
2030 // store raster tile aligned x, y, perspective correct z
2031 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2032 desc
.pTriBuffer
= pTriBuffer
;
2033 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
2034 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
2035 *pTriBuffer
= aZ
[primIndex
];
2037 uint32_t tX
= aTileRelativeX
[primIndex
];
2038 uint32_t tY
= aTileRelativeY
[primIndex
];
2040 // pack the relative x,y into the coverageMask, the rasterizer will
2041 // generate the true coverage mask from it
2042 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
2045 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2046 #if KNOB_ENABLE_TOSS_POINTS
2047 if (!KNOB_TOSS_SETUP_TRIS
)
2050 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
2053 primMask
&= ~(1 << primIndex
);
2058 // non simple points need to be potentially binned to multiple macro tiles
2059 simd16scalar vPointSize
;
2061 if (rastState
.pointParam
)
2063 simd16vector size
[3];
2064 pa
.Assemble_simd16(VERTEX_POINT_SIZE_SLOT
, size
);
2065 vPointSize
= size
[0].x
;
2069 vPointSize
= _simd16_set1_ps(rastState
.pointSize
);
2072 // bloat point to bbox
2075 bbox
.xmin
= bbox
.xmax
= vXi
;
2076 bbox
.ymin
= bbox
.ymax
= vYi
;
2078 simd16scalar vHalfWidth
= _simd16_mul_ps(vPointSize
, _simd16_set1_ps(0.5f
));
2079 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2081 bbox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2082 bbox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
2083 bbox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2084 bbox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
2086 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2087 // Gather the AOS effective scissor rects based on the per-prim VP index.
2088 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
2089 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2090 if (state
.gsState
.emitsViewportArrayIndex
)
2092 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2093 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2095 else // broadcast fast path for non-VPAI case.
2097 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2098 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2099 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2100 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2103 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
2104 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
2105 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
2106 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
2108 // Cull bloated points completely outside scissor
2109 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2110 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2111 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2112 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
2113 primMask
= primMask
& ~maskOutsideScissor
;
2115 // Convert bbox to macrotile units.
2116 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2117 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2118 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2119 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2121 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
2123 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
2124 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
2125 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
2126 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
2128 // store render target array index
2129 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
2130 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2132 simd16vector vRtai
[2];
2133 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
2134 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0].x
);
2135 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
2139 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
2142 OSALIGNSIMD16(float) aPointSize
[KNOB_SIMD16_WIDTH
];
2143 _simd16_store_ps(reinterpret_cast<float *>(aPointSize
), vPointSize
);
2145 uint32_t *pPrimID
= (uint32_t *)&primID
;
2147 OSALIGNSIMD16(float) aPrimVertsX
[KNOB_SIMD16_WIDTH
];
2148 OSALIGNSIMD16(float) aPrimVertsY
[KNOB_SIMD16_WIDTH
];
2149 OSALIGNSIMD16(float) aPrimVertsZ
[KNOB_SIMD16_WIDTH
];
2151 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX
), primVerts
.x
);
2152 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY
), primVerts
.y
);
2153 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ
), primVerts
.z
);
2155 // scan remaining valid prims and bin each separately
2156 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
2158 while (_BitScanForward(&primIndex
, primMask
))
2160 uint32_t linkageCount
= backendState
.numAttributes
;
2161 uint32_t numScalarAttribs
= linkageCount
* 4;
2166 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2168 desc
.triFlags
.frontFacing
= 1;
2169 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2170 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
2171 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2172 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2174 work
.pfnWork
= RasterizeTriPoint
;
2176 auto pArena
= pDC
->pArena
;
2177 SWR_ASSERT(pArena
!= nullptr);
2179 // store active attribs
2180 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2181 desc
.numAttribs
= linkageCount
;
2182 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2184 // store point vertex data
2185 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2186 desc
.pTriBuffer
= pTriBuffer
;
2187 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
2188 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
2189 *pTriBuffer
= aPrimVertsZ
[primIndex
];
2191 // store user clip distances
2192 if (rastState
.clipDistanceMask
)
2194 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2195 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
2198 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
2199 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
2200 desc
.pUserClipBuffer
[3 * i
+ 0] = 0.0f
;
2201 desc
.pUserClipBuffer
[3 * i
+ 1] = 0.0f
;
2202 desc
.pUserClipBuffer
[3 * i
+ 2] = dists
[i
];
2206 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2207 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2209 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2211 #if KNOB_ENABLE_TOSS_POINTS
2212 if (!KNOB_TOSS_SETUP_TRIS
)
2215 pTileMgr
->enqueue(x
, y
, &work
);
2220 primMask
&= ~(1 << primIndex
);
2224 AR_END(FEBinPoints
, 1);
2227 void SIMDAPI
BinPoints_simd16(
2231 simd16vector prim
[3],
2233 simd16scalari primID
,
2234 simd16scalari viewportIdx
)
2236 simd16vector
& primVerts
= prim
[0];
2238 const API_STATE
& state
= GetApiState(pDC
);
2239 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2240 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2242 if (!feState
.vpTransformDisable
)
2244 // perspective divide
2245 simd16scalar vRecipW0
= _simd16_div_ps(_simd16_set1_ps(1.0f
), primVerts
.w
);
2247 primVerts
.x
= _simd16_mul_ps(primVerts
.x
, vRecipW0
);
2248 primVerts
.y
= _simd16_mul_ps(primVerts
.y
, vRecipW0
);
2249 primVerts
.z
= _simd16_mul_ps(primVerts
.z
, vRecipW0
);
2251 // viewport transform to screen coords
2252 if (state
.gsState
.emitsViewportArrayIndex
)
2254 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
2258 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
2262 const simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2264 primVerts
.x
= _simd16_add_ps(primVerts
.x
, offset
);
2265 primVerts
.y
= _simd16_add_ps(primVerts
.y
, offset
);
2267 BinPostSetupPoints_simd16(
2278 //////////////////////////////////////////////////////////////////////////
2279 /// @brief Bin SIMD lines to the backend.
2280 /// @param pDC - pointer to draw context.
2281 /// @param pa - The primitive assembly object.
2282 /// @param workerId - thread's worker id. Even thread has a unique id.
2283 /// @param tri - Contains line position data for SIMDs worth of points.
2284 /// @param primID - Primitive ID for each line.
2285 /// @param viewportIdx - Viewport Array Index for each line.
2286 void BinPostSetupLines(
2291 simdscalar recipW
[],
2294 simdscalari viewportIdx
)
2296 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2298 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2300 const API_STATE
& state
= GetApiState(pDC
);
2301 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2302 const SWR_GS_STATE
& gsState
= state
.gsState
;
2304 // Select attribute processor
2305 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2306 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2308 simdscalar
& vRecipW0
= recipW
[0];
2309 simdscalar
& vRecipW1
= recipW
[1];
2311 // convert to fixed point
2312 simdscalari vXi
[2], vYi
[2];
2313 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2314 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2315 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2316 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2318 // compute x-major vs y-major mask
2319 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2320 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2321 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2322 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2324 // cull zero-length lines
2325 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2326 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2328 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2330 uint32_t *pPrimID
= (uint32_t *)&primID
;
2331 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2333 simdscalar vUnused
= _simd_setzero_ps();
2335 // Calc bounding box of lines
2337 bbox
.xmin
= _simd_min_epi32(vXi
[0], vXi
[1]);
2338 bbox
.xmax
= _simd_max_epi32(vXi
[0], vXi
[1]);
2339 bbox
.ymin
= _simd_min_epi32(vYi
[0], vYi
[1]);
2340 bbox
.ymax
= _simd_max_epi32(vYi
[0], vYi
[1]);
2342 // bloat bbox by line width along minor axis
2343 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2344 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2346 bloatBox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2347 bloatBox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
2348 bloatBox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2349 bloatBox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
2351 bbox
.xmin
= _simd_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2352 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2353 bbox
.ymin
= _simd_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2354 bbox
.ymax
= _simd_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2356 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2357 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2358 if (state
.gsState
.emitsViewportArrayIndex
)
2360 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2361 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2363 else // broadcast fast path for non-VPAI case.
2365 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2366 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2367 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2368 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2371 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2372 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2373 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2374 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2376 // Cull prims completely outside scissor
2378 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2379 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2380 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2381 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2382 primMask
= primMask
& ~maskOutsideScissor
;
2390 // Convert triangle bbox to macrotile units.
2391 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2392 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2393 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2394 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2396 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2397 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2398 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2399 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2400 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2402 // transpose verts needed for backend
2403 /// @todo modify BE to take non-transformed verts
2404 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2405 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2406 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2407 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2408 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2410 // store render target array index
2411 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2412 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2414 simdvector vRtai
[2];
2415 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2416 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2417 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2421 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2424 // scan remaining valid prims and bin each separately
2426 while (_BitScanForward(&primIndex
, primMask
))
2428 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2429 uint32_t numScalarAttribs
= linkageCount
* 4;
2434 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2436 desc
.triFlags
.frontFacing
= 1;
2437 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2438 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2439 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2440 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2442 work
.pfnWork
= RasterizeLine
;
2444 auto pArena
= pDC
->pArena
;
2445 SWR_ASSERT(pArena
!= nullptr);
2447 // store active attribs
2448 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2449 desc
.numAttribs
= linkageCount
;
2450 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2452 // store line vertex data
2453 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2454 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2455 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2456 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2457 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2459 // store user clip distances
2460 if (rastState
.clipDistanceMask
)
2462 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2463 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2464 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2467 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2468 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2470 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2472 #if KNOB_ENABLE_TOSS_POINTS
2473 if (!KNOB_TOSS_SETUP_TRIS
)
2476 pTileMgr
->enqueue(x
, y
, &work
);
2481 primMask
&= ~(1 << primIndex
);
2486 AR_END(FEBinLines
, 1);
2489 #if USE_SIMD16_FRONTEND
2490 void BinPostSetupLines_simd16(
2494 simd16vector prim
[],
2495 simd16scalar recipW
[],
2497 simd16scalari primID
,
2498 simd16scalari viewportIdx
)
2500 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2502 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2504 const API_STATE
& state
= GetApiState(pDC
);
2505 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2506 const SWR_GS_STATE
& gsState
= state
.gsState
;
2508 // Select attribute processor
2509 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2510 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2512 simd16scalar
& vRecipW0
= recipW
[0];
2513 simd16scalar
& vRecipW1
= recipW
[1];
2515 // convert to fixed point
2516 simd16scalari vXi
[2], vYi
[2];
2518 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2519 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2520 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2521 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2523 // compute x-major vs y-major mask
2524 simd16scalari xLength
= _simd16_abs_epi32(_simd16_sub_epi32(vXi
[0], vXi
[1]));
2525 simd16scalari yLength
= _simd16_abs_epi32(_simd16_sub_epi32(vYi
[0], vYi
[1]));
2526 simd16scalar vYmajorMask
= _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength
, xLength
));
2527 uint32_t yMajorMask
= _simd16_movemask_ps(vYmajorMask
);
2529 // cull zero-length lines
2530 simd16scalari vZeroLengthMask
= _simd16_cmpeq_epi32(xLength
, _simd16_setzero_si());
2531 vZeroLengthMask
= _simd16_and_si(vZeroLengthMask
, _simd16_cmpeq_epi32(yLength
, _simd16_setzero_si()));
2533 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask
));
2535 uint32_t *pPrimID
= (uint32_t *)&primID
;
2536 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2538 // Calc bounding box of lines
2540 bbox
.xmin
= _simd16_min_epi32(vXi
[0], vXi
[1]);
2541 bbox
.xmax
= _simd16_max_epi32(vXi
[0], vXi
[1]);
2542 bbox
.ymin
= _simd16_min_epi32(vYi
[0], vYi
[1]);
2543 bbox
.ymax
= _simd16_max_epi32(vYi
[0], vYi
[1]);
2545 // bloat bbox by line width along minor axis
2546 simd16scalar vHalfWidth
= _simd16_set1_ps(rastState
.lineWidth
/ 2.0f
);
2547 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2549 simd16BBox bloatBox
;
2551 bloatBox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2552 bloatBox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
2553 bloatBox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2554 bloatBox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
2556 bbox
.xmin
= _simd16_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2557 bbox
.xmax
= _simd16_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2558 bbox
.ymin
= _simd16_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2559 bbox
.ymax
= _simd16_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2561 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2562 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2564 if (state
.gsState
.emitsViewportArrayIndex
)
2566 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2567 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2569 else // broadcast fast path for non-VPAI case.
2571 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2572 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2573 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2574 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2577 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
2578 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
2579 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
2580 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
2582 // Cull prims completely outside scissor
2584 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2585 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2586 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2587 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
2588 primMask
= primMask
& ~maskOutsideScissor
;
2591 const simdscalar unused
= _simd_setzero_ps();
2598 // Convert triangle bbox to macrotile units.
2599 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2600 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2601 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2602 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2604 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
2606 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
2607 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
2608 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
2609 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
2611 // transpose verts needed for backend
2612 /// @todo modify BE to take non-transformed verts
2613 __m128 vHorizX
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2614 __m128 vHorizY
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2615 __m128 vHorizZ
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2616 __m128 vHorizW
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2618 vTranspose3x8(vHorizX
[0], _simd16_extract_ps(prim
[0].x
, 0), _simd16_extract_ps(prim
[1].x
, 0), unused
);
2619 vTranspose3x8(vHorizY
[0], _simd16_extract_ps(prim
[0].y
, 0), _simd16_extract_ps(prim
[1].y
, 0), unused
);
2620 vTranspose3x8(vHorizZ
[0], _simd16_extract_ps(prim
[0].z
, 0), _simd16_extract_ps(prim
[1].z
, 0), unused
);
2621 vTranspose3x8(vHorizW
[0], _simd16_extract_ps(vRecipW0
, 0), _simd16_extract_ps(vRecipW1
, 0), unused
);
2623 vTranspose3x8(vHorizX
[1], _simd16_extract_ps(prim
[0].x
, 1), _simd16_extract_ps(prim
[1].x
, 1), unused
);
2624 vTranspose3x8(vHorizY
[1], _simd16_extract_ps(prim
[0].y
, 1), _simd16_extract_ps(prim
[1].y
, 1), unused
);
2625 vTranspose3x8(vHorizZ
[1], _simd16_extract_ps(prim
[0].z
, 1), _simd16_extract_ps(prim
[1].z
, 1), unused
);
2626 vTranspose3x8(vHorizW
[1], _simd16_extract_ps(vRecipW0
, 1), _simd16_extract_ps(vRecipW1
, 1), unused
);
2628 // store render target array index
2629 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
2630 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2632 simd16vector vRtai
[2];
2633 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
2634 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0].x
);
2635 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
2639 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
2642 // scan remaining valid prims and bin each separately
2644 while (_BitScanForward(&primIndex
, primMask
))
2646 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2647 uint32_t numScalarAttribs
= linkageCount
* 4;
2652 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2654 desc
.triFlags
.frontFacing
= 1;
2655 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2656 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2657 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2658 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2660 work
.pfnWork
= RasterizeLine
;
2662 auto pArena
= pDC
->pArena
;
2663 SWR_ASSERT(pArena
!= nullptr);
2665 // store active attribs
2666 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2667 desc
.numAttribs
= linkageCount
;
2668 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2670 // store line vertex data
2671 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2674 const uint32_t i
= primIndex
>> 3; // triIndex / KNOB_SIMD_WIDTH
2675 const uint32_t j
= primIndex
& 7; // triIndex % KNOB_SIMD_WIDTH
2677 _mm_store_ps(&desc
.pTriBuffer
[ 0], vHorizX
[i
][j
]);
2678 _mm_store_ps(&desc
.pTriBuffer
[ 4], vHorizY
[i
][j
]);
2679 _mm_store_ps(&desc
.pTriBuffer
[ 8], vHorizZ
[i
][j
]);
2680 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[i
][j
]);
2683 // store user clip distances
2684 if (rastState
.clipDistanceMask
)
2686 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2687 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2688 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2691 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2692 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2694 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2696 #if KNOB_ENABLE_TOSS_POINTS
2697 if (!KNOB_TOSS_SETUP_TRIS
)
2700 pTileMgr
->enqueue(x
, y
, &work
);
2705 primMask
&= ~(1 << primIndex
);
2710 AR_END(FEBinLines
, 1);
2714 //////////////////////////////////////////////////////////////////////////
2715 /// @brief Bin SIMD lines to the backend.
2716 /// @param pDC - pointer to draw context.
2717 /// @param pa - The primitive assembly object.
2718 /// @param workerId - thread's worker id. Every thread has a unique id.
2719 /// @param prim - Contains line position data for SIMDs worth of lines.
2720 /// @param primID - Primitive ID for each line.
2721 /// @param viewportIdx - Viewport Array Index for each line.
simdscalari viewportIdx
)
// NOTE(review): this chunk is a mangled extraction — the head of this
// function's signature (pDC, pa, workerId, prim, primMask, primID
// parameters) and its trailing statements are not visible here; confirm
// against the original file before editing further.
// Cache references to the immutable API state for this draw context.
2731 const API_STATE
& state
= GetApiState(pDC
);
2732 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2733 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
// Reciprocal-w for both line endpoints. Defaults to 1.0 so positions pass
// through unchanged when the viewport transform (and with it the
// perspective divide) is disabled.
2735 simdscalar vRecipW
[2] = { _simd_set1_ps(1.0f
), _simd_set1_ps(1.0f
) };
2737 if (!feState
.vpTransformDisable
)
2739 // perspective divide
2740 vRecipW
[0] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2741 vRecipW
[1] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
// Scale v[0], v[1], v[2] (x, y, z) of both endpoints by their 1/w.
2743 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2744 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2746 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2747 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2749 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2750 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2752 // viewport transform to screen coords
// Use the per-lane viewport index only when the GS emits a viewport array
// index; otherwise take the single-viewport path (no index argument).
2753 if (state
.gsState
.emitsViewportArrayIndex
)
2755 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2759 viewportTransform
<2>(prim
, state
.vpMatrices
);
2763 // adjust for pixel center location
// rastState.pixelLocation indexes g_pixelOffsets to pick the x/y shift
// applied to both endpoints (pixel-center convention).
2764 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2765 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2766 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2768 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2769 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2782 #if USE_SIMD16_FRONTEND
2783 void SIMDAPI
BinLines_simd16(
2787 simd16vector prim
[3],
2789 simd16scalari primID
,
2790 simd16scalari viewportIdx
)
2792 const API_STATE
& state
= GetApiState(pDC
);
2793 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2794 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2796 simd16scalar vRecipW
[2] = { _simd16_set1_ps(1.0f
), _simd16_set1_ps(1.0f
) };
2798 if (!feState
.vpTransformDisable
)
2800 // perspective divide
2801 vRecipW
[0] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[0].w
);
2802 vRecipW
[1] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[1].w
);
2804 prim
[0].v
[0] = _simd16_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2805 prim
[1].v
[0] = _simd16_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2807 prim
[0].v
[1] = _simd16_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2808 prim
[1].v
[1] = _simd16_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2810 prim
[0].v
[2] = _simd16_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2811 prim
[1].v
[2] = _simd16_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2813 // viewport transform to screen coords
2814 if (state
.gsState
.emitsViewportArrayIndex
)
2816 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2820 viewportTransform
<2>(prim
, state
.vpMatrices
);
2824 // adjust for pixel center location
2825 simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2827 prim
[0].x
= _simd16_add_ps(prim
[0].x
, offset
);
2828 prim
[0].y
= _simd16_add_ps(prim
[0].y
, offset
);
2830 prim
[1].x
= _simd16_add_ps(prim
[1].x
, offset
);
2831 prim
[1].y
= _simd16_add_ps(prim
[1].y
, offset
);
2833 BinPostSetupLines_simd16(