1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
31 #include "conservativeRast.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
38 void BinPostSetupLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], simdscalar vRecipW
[2], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
#if USE_SIMD16_FRONTEND
// Forward declaration: simd16 variant of the post-setup line binner.
void BinPostSetupLines_simd16(DRAW_CONTEXT*  pDC,
                              PA_STATE&      pa,
                              uint32_t       workerId,
                              simd16vector   prims[3],
                              simd16scalar   vRecipW[2],
                              uint32_t       primMask,
                              simd16scalari  primID,
                              simd16scalari  viewportIdx);
#endif
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Offsets added to post-viewport vertex positions based on
47 static const simdscalar g_pixelOffsets
[SWR_PIXEL_LOCATION_UL
+ 1] =
49 _simd_set1_ps(0.0f
), // SWR_PIXEL_LOCATION_CENTER
50 _simd_set1_ps(0.5f
), // SWR_PIXEL_LOCATION_UL
#if USE_SIMD16_FRONTEND
/// simd16 variant of g_pixelOffsets, indexed by SWR_PIXEL_LOCATION.
static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd16_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd16_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
#endif
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
63 /// Point precision from FP32.
64 template <typename PT
= FixedPointTraits
<Fixed_16_8
>>
65 INLINE simdscalari
fpToFixedPointVertical(const simdscalar vIn
)
67 simdscalar vFixed
= _simd_mul_ps(vIn
, _simd_set1_ps(PT::ScaleT::value
));
68 return _simd_cvtps_epi32(vFixed
);
#if USE_SIMD16_FRONTEND
/// simd16 variant: convert FP32 coords to the requested fixed-point precision.
template <typename PT = FixedPointTraits<Fixed_16_8>>
INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
{
    const simd16scalar vScaled = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
    return _simd16_cvtps_epi32(vScaled);
}
#endif
80 //////////////////////////////////////////////////////////////////////////
81 /// @brief Helper function to set the X,Y coords of a triangle to the
82 /// requested Fixed Point precision from FP32.
83 /// @param tri: simdvector[3] of FP triangle verts
84 /// @param vXi: fixed point X coords of tri verts
85 /// @param vYi: fixed point Y coords of tri verts
86 INLINE
static void FPToFixedPoint(const simdvector
* const tri
, simdscalari(&vXi
)[3], simdscalari(&vYi
)[3])
88 vXi
[0] = fpToFixedPointVertical(tri
[0].x
);
89 vYi
[0] = fpToFixedPointVertical(tri
[0].y
);
90 vXi
[1] = fpToFixedPointVertical(tri
[1].x
);
91 vYi
[1] = fpToFixedPointVertical(tri
[1].y
);
92 vXi
[2] = fpToFixedPointVertical(tri
[2].x
);
93 vYi
[2] = fpToFixedPointVertical(tri
[2].y
);
#if USE_SIMD16_FRONTEND
/// simd16 variant: convert triangle X,Y verts to fixed point (see above).
INLINE static void FPToFixedPoint(const simd16vector* const tri,
                                  simd16scalari (&vXi)[3],
                                  simd16scalari (&vYi)[3])
{
    // Convert each of the three vertices' X and Y lanes in turn.
    for (uint32_t v = 0; v < 3; ++v)
    {
        vXi[v] = fpToFixedPointVertical(tri[v].x);
        vYi[v] = fpToFixedPointVertical(tri[v].y);
    }
}
#endif
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief Calculate bounding box for current triangle
110 /// @tparam CT: ConservativeRastFETraits type
111 /// @param vX: fixed point X position for triangle verts
112 /// @param vY: fixed point Y position for triangle verts
113 /// @param bbox: fixed point bbox
114 /// *Note*: expects vX, vY to be in the correct precision for the type
115 /// of rasterization. This avoids unnecessary FP->fixed conversions.
116 template <typename CT
>
117 INLINE
void calcBoundingBoxIntVertical(const simdvector
* const tri
, simdscalari(&vX
)[3], simdscalari(&vY
)[3], simdBBox
&bbox
)
119 simdscalari vMinX
= vX
[0];
120 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
121 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
123 simdscalari vMaxX
= vX
[0];
124 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
125 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
127 simdscalari vMinY
= vY
[0];
128 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
129 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
131 simdscalari vMaxY
= vY
[0];
132 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
133 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
#if USE_SIMD16_FRONTEND
/// simd16 variant: calculate the fixed-point bounding box for the current
/// triangle. tri is not referenced; kept for signature parity.
template <typename CT>
INLINE void calcBoundingBoxIntVertical(const simd16vector* const tri,
                                       simd16scalari (&vX)[3],
                                       simd16scalari (&vY)[3],
                                       simd16BBox& bbox)
{
    simd16scalari vMinX = vX[0];
    vMinX = _simd16_min_epi32(vMinX, vX[1]);
    vMinX = _simd16_min_epi32(vMinX, vX[2]);

    simd16scalari vMaxX = vX[0];
    vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
    vMaxX = _simd16_max_epi32(vMaxX, vX[2]);

    simd16scalari vMinY = vY[0];
    vMinY = _simd16_min_epi32(vMinY, vY[1]);
    vMinY = _simd16_min_epi32(vMinY, vY[2]);

    simd16scalari vMaxY = vY[0];
    vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
    vMaxY = _simd16_max_epi32(vMaxY, vY[2]);

    // Store the computed extents to the caller's bbox; without these stores
    // the function would compute the min/max for nothing.
    bbox.xmin = vMinX;
    bbox.xmax = vMaxX;
    bbox.ymin = vMinY;
    bbox.ymax = vMaxY;
}
#endif
172 //////////////////////////////////////////////////////////////////////////
173 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
174 /// Offsets BBox for conservative rast
176 INLINE
void calcBoundingBoxIntVertical
<FEConservativeRastT
>(const simdvector
* const tri
, simdscalari(&vX
)[3], simdscalari(&vY
)[3], simdBBox
&bbox
)
178 // FE conservative rast traits
179 typedef FEConservativeRastT CT
;
181 simdscalari vMinX
= vX
[0];
182 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
183 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
185 simdscalari vMaxX
= vX
[0];
186 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
187 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
189 simdscalari vMinY
= vY
[0];
190 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
191 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
193 simdscalari vMaxY
= vY
[0];
194 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
195 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
197 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
198 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
199 bbox
.xmin
= _simd_sub_epi32(vMinX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
200 bbox
.xmax
= _simd_add_epi32(vMaxX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
201 bbox
.ymin
= _simd_sub_epi32(vMinY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
202 bbox
.ymax
= _simd_add_epi32(vMaxY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
205 //////////////////////////////////////////////////////////////////////////
206 /// @brief Processes attributes for the backend based on linkage mask and
207 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
208 /// @param pDC - Draw context
209 /// @param pa - Primitive Assembly state
210 /// @param linkageMask - Specifies which VS outputs are routed to PS.
211 /// @param pLinkageMap - maps VS attribute slot to PS slot
212 /// @param triIndex - Triangle to process attributes for
213 /// @param pBuffer - Output result
214 template<typename NumVertsT
, typename IsSwizzledT
, typename HasConstantInterpT
, typename IsDegenerate
>
215 INLINE
void ProcessAttributes(
222 static_assert(NumVertsT::value
> 0 && NumVertsT::value
<= 3, "Invalid value for NumVertsT");
223 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
224 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
225 LONG constantInterpMask
= IsDegenerate::value
? 0xFFFFFFFF : backendState
.constantInterpolationMask
;
226 const uint32_t provokingVertex
= pDC
->pState
->state
.frontendState
.topologyProvokingVertex
;
227 const PRIMITIVE_TOPOLOGY topo
= pDC
->pState
->state
.topology
;
229 static const float constTable
[3][4] = {
230 { 0.0f
, 0.0f
, 0.0f
, 0.0f
},
231 { 0.0f
, 0.0f
, 0.0f
, 1.0f
},
232 { 1.0f
, 1.0f
, 1.0f
, 1.0f
}
235 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
238 if (IsSwizzledT::value
)
240 SWR_ATTRIB_SWIZZLE attribSwizzle
= backendState
.swizzleMap
[i
];
241 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ attribSwizzle
.sourceAttrib
;
246 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ i
;
249 __m128 attrib
[3]; // triangle attribs (always 4 wide)
250 float* pAttribStart
= pBuffer
;
252 if (HasConstantInterpT::value
|| IsDegenerate::value
)
254 if (_bittest(&constantInterpMask
, i
))
257 uint32_t adjustedTriIndex
;
258 static const uint32_t tristripProvokingVertex
[] = { 0, 2, 1 };
259 static const int32_t quadProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
260 static const uint32_t quadProvokingVertex
[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
261 static const int32_t qstripProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
262 static const uint32_t qstripProvokingVertex
[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
266 adjustedTriIndex
= triIndex
+ quadProvokingTri
[triIndex
& 1][provokingVertex
];
267 vid
= quadProvokingVertex
[triIndex
& 1][provokingVertex
];
270 adjustedTriIndex
= triIndex
+ qstripProvokingTri
[triIndex
& 1][provokingVertex
];
271 vid
= qstripProvokingVertex
[triIndex
& 1][provokingVertex
];
273 case TOP_TRIANGLE_STRIP
:
274 adjustedTriIndex
= triIndex
;
276 ? tristripProvokingVertex
[provokingVertex
]
280 adjustedTriIndex
= triIndex
;
281 vid
= provokingVertex
;
285 pa
.AssembleSingle(inputSlot
, adjustedTriIndex
, attrib
);
287 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
289 _mm_store_ps(pBuffer
, attrib
[vid
]);
295 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
297 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
299 _mm_store_ps(pBuffer
, attrib
[i
]);
306 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
308 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
310 _mm_store_ps(pBuffer
, attrib
[i
]);
315 // pad out the attrib buffer to 3 verts to ensure the triangle
316 // interpolation code in the pixel shader works correctly for the
317 // 3 topologies - point, line, tri. This effectively zeros out the
318 // effect of the missing vertices in the triangle interpolation.
319 for (uint32_t v
= NumVertsT::value
; v
< 3; ++v
)
321 _mm_store_ps(pBuffer
, attrib
[NumVertsT::value
- 1]);
325 // check for constant source overrides
326 if (IsSwizzledT::value
)
328 uint32_t mask
= backendState
.swizzleMap
[i
].componentOverrideMask
;
332 while (_BitScanForward(&comp
, mask
))
334 mask
&= ~(1 << comp
);
336 float constantValue
= 0.0f
;
337 switch ((SWR_CONSTANT_SOURCE
)backendState
.swizzleMap
[i
].constantSource
)
339 case SWR_CONSTANT_SOURCE_CONST_0000
:
340 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT
:
341 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT
:
342 constantValue
= constTable
[backendState
.swizzleMap
[i
].constantSource
][comp
];
344 case SWR_CONSTANT_SOURCE_PRIM_ID
:
345 constantValue
= *(float*)&primId
;
349 // apply constant value to all 3 vertices
350 for (uint32_t v
= 0; v
< 3; ++v
)
352 pAttribStart
[comp
+ v
* 4] = constantValue
;
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Gather scissor rect data based on per-prim viewport indices.
362 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
363 /// @param pViewportIndex - array of per-primitive vewport indexes.
364 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
365 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
366 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
367 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
369 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
370 template<size_t SimdWidth
>
371 struct GatherScissors
373 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
374 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
375 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
377 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
382 struct GatherScissors
<8>
384 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
385 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
386 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
388 scisXmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
389 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
390 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
391 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
392 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
393 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
394 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
395 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
396 scisYmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
397 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
398 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
399 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
400 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
401 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
402 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
403 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
404 scisXmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
405 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
406 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
407 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
408 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
409 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
410 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
411 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
412 scisYmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
413 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
414 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
415 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
416 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
417 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
418 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
419 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
#if USE_SIMD16_FRONTEND
/// simd16 scissor gather: primary template is an error trap for unsupported widths.
template <size_t SimdWidth>
struct GatherScissors_simd16
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
                       simd16scalari& scisXmin, simd16scalari& scisYmin,
                       simd16scalari& scisXmax, simd16scalari& scisYmax)
    {
        SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
    }
};

/// 16-wide specialization: gather each lane's scissor rect fields through the
/// per-primitive viewport index.
template <>
struct GatherScissors_simd16<16>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
                       simd16scalari& scisXmin, simd16scalari& scisYmin,
                       simd16scalari& scisXmax, simd16scalari& scisYmax)
    {
        // Resolve lane -> effective scissor rect once, instead of repeating
        // the double subscript for every field.
        auto rect = [&](uint32_t lane) -> const SWR_RECT& {
            return pScissorsInFixedPoint[pViewportIndex[lane]];
        };

        scisXmin = _simd16_set_epi32(rect(0).xmin,  rect(1).xmin,  rect(2).xmin,  rect(3).xmin,
                                     rect(4).xmin,  rect(5).xmin,  rect(6).xmin,  rect(7).xmin,
                                     rect(8).xmin,  rect(9).xmin,  rect(10).xmin, rect(11).xmin,
                                     rect(12).xmin, rect(13).xmin, rect(14).xmin, rect(15).xmin);

        scisYmin = _simd16_set_epi32(rect(0).ymin,  rect(1).ymin,  rect(2).ymin,  rect(3).ymin,
                                     rect(4).ymin,  rect(5).ymin,  rect(6).ymin,  rect(7).ymin,
                                     rect(8).ymin,  rect(9).ymin,  rect(10).ymin, rect(11).ymin,
                                     rect(12).ymin, rect(13).ymin, rect(14).ymin, rect(15).ymin);

        scisXmax = _simd16_set_epi32(rect(0).xmax,  rect(1).xmax,  rect(2).xmax,  rect(3).xmax,
                                     rect(4).xmax,  rect(5).xmax,  rect(6).xmax,  rect(7).xmax,
                                     rect(8).xmax,  rect(9).xmax,  rect(10).xmax, rect(11).xmax,
                                     rect(12).xmax, rect(13).xmax, rect(14).xmax, rect(15).xmax);

        scisYmax = _simd16_set_epi32(rect(0).ymax,  rect(1).ymax,  rect(2).ymax,  rect(3).ymax,
                                     rect(4).ymax,  rect(5).ymax,  rect(6).ymax,  rect(7).ymax,
                                     rect(8).ymax,  rect(9).ymax,  rect(10).ymax, rect(11).ymax,
                                     rect(12).ymax, rect(13).ymax, rect(14).ymax, rect(15).ymax);
    }
};
#endif
513 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
515 struct ProcessAttributesChooser
517 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
519 template <typename
... ArgsB
>
520 static FuncType
GetFunc()
522 return ProcessAttributes
<ArgsB
...>;
526 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
528 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
531 //////////////////////////////////////////////////////////////////////////
532 /// @brief Processes enabled user clip distances. Loads the active clip
533 /// distances from the PA, sets up barycentric equations, and
534 /// stores the results to the output buffer
535 /// @param pa - Primitive Assembly state
536 /// @param primIndex - primitive index to process
537 /// @param clipDistMask - mask of enabled clip distances
538 /// @param pUserClipBuffer - buffer to store results
539 template<uint32_t NumVerts
>
540 void ProcessUserClipDist(PA_STATE
& pa
, uint32_t primIndex
, uint8_t clipDistMask
, float *pRecipW
, float* pUserClipBuffer
)
543 while (_BitScanForward(&clipDist
, clipDistMask
))
545 clipDistMask
&= ~(1 << clipDist
);
546 uint32_t clipSlot
= clipDist
>> 2;
547 uint32_t clipComp
= clipDist
& 0x3;
548 uint32_t clipAttribSlot
= clipSlot
== 0 ?
549 VERTEX_CLIPCULL_DIST_LO_SLOT
: VERTEX_CLIPCULL_DIST_HI_SLOT
;
551 __m128 primClipDist
[3];
552 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
554 float vertClipDist
[NumVerts
];
555 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
557 OSALIGNSIMD(float) aVertClipDist
[4];
558 _mm_store_ps(aVertClipDist
, primClipDist
[e
]);
559 vertClipDist
[e
] = aVertClipDist
[clipComp
];
562 // setup plane equations for barycentric interpolation in the backend
563 float baryCoeff
[NumVerts
];
564 float last
= vertClipDist
[NumVerts
- 1] * pRecipW
[NumVerts
- 1];
565 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
567 baryCoeff
[e
] = vertClipDist
[e
] * pRecipW
[e
] - last
;
569 baryCoeff
[NumVerts
- 1] = last
;
571 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
573 *(pUserClipBuffer
++) = baryCoeff
[e
];
578 //////////////////////////////////////////////////////////////////////////
579 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
580 /// culling, viewport transform, etc.
581 /// @param pDC - pointer to draw context.
582 /// @param pa - The primitive assembly object.
583 /// @param workerId - thread's worker id. Even thread has a unique id.
584 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
585 /// @param primID - Primitive ID for each triangle.
586 /// @param viewportIdx - viewport array index for each triangle.
587 /// @tparam CT - ConservativeRastFETraits
588 template <typename CT
>
596 simdscalari viewportIdx
)
598 SWR_CONTEXT
*pContext
= pDC
->pContext
;
600 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
602 const API_STATE
& state
= GetApiState(pDC
);
603 const SWR_RASTSTATE
& rastState
= state
.rastState
;
604 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
605 const SWR_GS_STATE
& gsState
= state
.gsState
;
606 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
608 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
609 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
610 simdscalar vRecipW2
= _simd_set1_ps(1.0f
);
612 if (feState
.vpTransformDisable
)
614 // RHW is passed in directly when VP transform is disabled
615 vRecipW0
= tri
[0].v
[3];
616 vRecipW1
= tri
[1].v
[3];
617 vRecipW2
= tri
[2].v
[3];
621 // Perspective divide
622 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[0].w
);
623 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[1].w
);
624 vRecipW2
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[2].w
);
626 tri
[0].v
[0] = _simd_mul_ps(tri
[0].v
[0], vRecipW0
);
627 tri
[1].v
[0] = _simd_mul_ps(tri
[1].v
[0], vRecipW1
);
628 tri
[2].v
[0] = _simd_mul_ps(tri
[2].v
[0], vRecipW2
);
630 tri
[0].v
[1] = _simd_mul_ps(tri
[0].v
[1], vRecipW0
);
631 tri
[1].v
[1] = _simd_mul_ps(tri
[1].v
[1], vRecipW1
);
632 tri
[2].v
[1] = _simd_mul_ps(tri
[2].v
[1], vRecipW2
);
634 tri
[0].v
[2] = _simd_mul_ps(tri
[0].v
[2], vRecipW0
);
635 tri
[1].v
[2] = _simd_mul_ps(tri
[1].v
[2], vRecipW1
);
636 tri
[2].v
[2] = _simd_mul_ps(tri
[2].v
[2], vRecipW2
);
638 // Viewport transform to screen space coords
639 if (state
.gsState
.emitsViewportArrayIndex
)
641 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
645 viewportTransform
<3>(tri
, state
.vpMatrices
);
649 // Adjust for pixel center location
650 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
651 tri
[0].x
= _simd_add_ps(tri
[0].x
, offset
);
652 tri
[0].y
= _simd_add_ps(tri
[0].y
, offset
);
654 tri
[1].x
= _simd_add_ps(tri
[1].x
, offset
);
655 tri
[1].y
= _simd_add_ps(tri
[1].y
, offset
);
657 tri
[2].x
= _simd_add_ps(tri
[2].x
, offset
);
658 tri
[2].y
= _simd_add_ps(tri
[2].y
, offset
);
660 simdscalari vXi
[3], vYi
[3];
661 // Set vXi, vYi to required fixed point precision
662 FPToFixedPoint(tri
, vXi
, vYi
);
665 simdscalari vAi
[3], vBi
[3];
666 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
670 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
673 int maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[0], _simd_setzero_si())));
674 int maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[1], _simd_setzero_si())));
676 int cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
678 uint32_t origTriMask
= triMask
;
679 // don't cull degenerate triangles if we're conservatively rasterizing
680 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
682 triMask
&= ~cullZeroAreaMask
;
685 // determine front winding tris
688 // 0 area triangles are marked as backfacing regardless of winding order,
689 // which is required behavior for conservative rast and wireframe rendering
690 uint32_t frontWindingTris
;
691 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
693 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[0], _simd_setzero_si())));
694 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[1], _simd_setzero_si())));
698 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[0])));
699 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[1])));
701 frontWindingTris
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
705 switch ((SWR_CULLMODE
)rastState
.cullMode
)
707 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
708 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
709 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
710 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
711 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
712 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
715 triMask
&= ~cullTris
;
717 if (origTriMask
^ triMask
)
719 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
722 // Simple non-conformant wireframe mode, useful for debugging
723 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
725 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
727 simdscalar recipW
[2];
730 recipW
[0] = vRecipW0
;
731 recipW
[1] = vRecipW1
;
732 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
736 recipW
[0] = vRecipW1
;
737 recipW
[1] = vRecipW2
;
738 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
742 recipW
[0] = vRecipW2
;
743 recipW
[1] = vRecipW0
;
744 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
746 AR_END(FEBinTriangles
, 1);
750 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
751 // compute per tri backface
752 uint32_t frontFaceMask
= frontWindingTris
;
753 uint32_t *pPrimID
= (uint32_t *)&primID
;
754 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
757 PFN_WORK_FUNC pfnWork
;
758 if (CT::IsConservativeT::value
)
760 // determine which edges of the degenerate tri, if any, are valid to rasterize.
761 // used to call the appropriate templated rasterizer function
762 if (cullZeroAreaMask
> 0)
765 simdscalari x0x1Mask
= _simd_cmpeq_epi32(vXi
[0], vXi
[1]);
766 simdscalari y0y1Mask
= _simd_cmpeq_epi32(vYi
[0], vYi
[1]);
767 uint32_t e0Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask
, y0y1Mask
)));
770 simdscalari x1x2Mask
= _simd_cmpeq_epi32(vXi
[1], vXi
[2]);
771 simdscalari y1y2Mask
= _simd_cmpeq_epi32(vYi
[1], vYi
[2]);
772 uint32_t e1Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask
, y1y2Mask
)));
775 // if v0 == v1 & v1 == v2, v0 == v2
776 uint32_t e2Mask
= e0Mask
& e1Mask
;
777 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
779 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
780 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
781 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
782 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
783 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
784 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
785 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
787 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
791 edgeEnable
= 0x00FFFFFF;
796 // degenerate triangles won't be sent to rasterizer; just enable all edges
797 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
798 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
803 goto endBinTriangles
;
806 // Calc bounding box of triangles
808 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
810 // determine if triangle falls between pixel centers and discard
811 // only discard for non-MSAA case and when conservative rast is disabled
812 // (xmin + 127) & ~255
813 // (xmax + 128) & ~255
814 if((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
815 (!CT::IsConservativeT::value
))
817 origTriMask
= triMask
;
821 simdscalari xmin
= _simd_add_epi32(bbox
.xmin
, _simd_set1_epi32(127));
822 xmin
= _simd_and_si(xmin
, _simd_set1_epi32(~255));
823 simdscalari xmax
= _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(128));
824 xmax
= _simd_and_si(xmax
, _simd_set1_epi32(~255));
826 simdscalari vMaskH
= _simd_cmpeq_epi32(xmin
, xmax
);
828 simdscalari ymin
= _simd_add_epi32(bbox
.ymin
, _simd_set1_epi32(127));
829 ymin
= _simd_and_si(ymin
, _simd_set1_epi32(~255));
830 simdscalari ymax
= _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(128));
831 ymax
= _simd_and_si(ymax
, _simd_set1_epi32(~255));
833 simdscalari vMaskV
= _simd_cmpeq_epi32(ymin
, ymax
);
834 vMaskV
= _simd_or_si(vMaskH
, vMaskV
);
835 cullCenterMask
= _simd_movemask_ps(_simd_castsi_ps(vMaskV
));
838 triMask
&= ~cullCenterMask
;
840 if (origTriMask
^ triMask
)
842 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
846 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
847 // Gather the AOS effective scissor rects based on the per-prim VP index.
848 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
849 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
850 if (state
.gsState
.emitsViewportArrayIndex
)
852 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
853 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
855 else // broadcast fast path for non-VPAI case.
857 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
858 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
859 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
860 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
863 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
864 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
865 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
866 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
868 if (CT::IsConservativeT::value
)
870 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
871 // some area. Bump the xmax/ymax edges out
872 simdscalari topEqualsBottom
= _simd_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
873 bbox
.ymax
= _simd_blendv_epi32(bbox
.ymax
, _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(1)), topEqualsBottom
);
874 simdscalari leftEqualsRight
= _simd_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
875 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(1)), leftEqualsRight
);
878 // Cull tris completely outside scissor
880 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
881 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
882 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
883 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
884 triMask
= triMask
& ~maskOutsideScissor
;
889 goto endBinTriangles
;
892 // Convert triangle bbox to macrotile units.
893 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
894 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
895 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
896 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
898 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
899 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
900 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
901 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
902 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
904 // transpose verts needed for backend
905 /// @todo modify BE to take non-transformed verts
906 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
907 vTranspose3x8(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
908 vTranspose3x8(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
909 vTranspose3x8(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
910 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
912 // store render target array index
913 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
914 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
917 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
919 vRtaii
= _simd_castps_si(vRtai
[0].x
);
920 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
924 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
929 // scan remaining valid triangles and bin each separately
930 while (_BitScanForward(&triIndex
, triMask
))
932 uint32_t linkageCount
= state
.backendState
.numAttributes
;
933 uint32_t numScalarAttribs
= linkageCount
* 4;
939 if (CT::IsConservativeT::value
)
941 // only rasterize valid edges if we have a degenerate primitive
942 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
943 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
944 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
946 // Degenerate triangles are required to be constant interpolated
947 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
951 isDegenerate
= false;
952 work
.pfnWork
= pfnWork
;
955 // Select attribute processor
956 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
957 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
959 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
961 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
962 desc
.triFlags
.primID
= pPrimID
[triIndex
];
963 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
964 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
966 auto pArena
= pDC
->pArena
;
967 SWR_ASSERT(pArena
!= nullptr);
969 // store active attribs
970 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
971 desc
.pAttribs
= pAttribs
;
972 desc
.numAttribs
= linkageCount
;
973 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
975 // store triangle vertex data
976 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
978 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
979 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
980 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
981 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
983 // store user clip distances
984 if (rastState
.clipDistanceMask
)
986 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
987 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
988 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
991 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
993 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
995 #if KNOB_ENABLE_TOSS_POINTS
996 if (!KNOB_TOSS_SETUP_TRIS
)
999 pTileMgr
->enqueue(x
, y
, &work
);
1003 triMask
&= ~(1 << triIndex
);
1006 AR_END(FEBinTriangles
, 1);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD16-width triangles to the macrotile backend. Performs
///        perspective divide / viewport transform, zero-area + backface
///        culling, between-pixel-center culling, scissor intersection,
///        then enqueues one BE_WORK item per (triangle, macrotile) pair.
/// @param pDC         - pointer to draw context.
/// @param pa          - primitive assembly object (source of attributes).
/// @param workerId    - calling thread's worker id.
/// @param tri         - position data for SIMD16's worth of triangles.
/// @param triMask     - bitmask of valid triangle lanes.
/// @param primID      - per-lane primitive IDs.
/// @param viewportIdx - per-lane viewport array indices.
/// NOTE(review): braces and a few declarations elided by the text
/// extraction were restored from surrounding context -- verify against
/// the upstream file before relying on the restored lines.
template <typename CT>
void SIMDAPI BinTriangles_simd16(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simd16vector tri[3],
    uint32_t triMask,
    simd16scalari primID,
    simd16scalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
    simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
    simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide
        vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];

    tri[0].x = _simd16_add_ps(tri[0].x, offset);
    tri[0].y = _simd16_add_ps(tri[0].y, offset);

    tri[1].x = _simd16_add_ps(tri[1].x, offset);
    tri[1].y = _simd16_add_ps(tri[1].y, offset);

    tri[2].x = _simd16_add_ps(tri[2].x, offset);
    tri[2].y = _simd16_add_ps(tri[2].y, offset);

    simd16scalari vXi[3], vYi[3];

    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup: per-edge A/B coefficients from the fixed-point verts
    simd16scalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (2x triangle area); split over two 64-bit SIMD registers
    simd16scalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // mask of lanes whose determinant is exactly zero (degenerate tris)
    uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
    uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));

    uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));

    // don't cull degenerate triangles if we're conservatively rasterizing
    uint32_t origTriMask = triMask;
    if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // 0 area triangles are marked as backfacing regardless of winding order,
    // which is required behavior for conservative rast and wireframe rendering
    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
        maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
    }
    else
    {
        maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
        maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
    }
    frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));

    // resolve the cull mask from the API cull mode
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    // Simple non-conformant wireframe mode, useful for debugging
    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
    {
        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
        simd16vector line[2];
        simd16scalar recipW[2];

        // NOTE(review): the line[] vertex assignments below were elided by the
        // extraction and restored from the pattern of the recipW assignments.
        line[0] = tri[0];
        line[1] = tri[1];
        recipW[0] = vRecipW0;
        recipW[1] = vRecipW1;
        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        line[0] = tri[1];
        line[1] = tri[2];
        recipW[0] = vRecipW1;
        recipW[1] = vRecipW2;
        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        line[0] = tri[2];
        line[1] = tri[0];
        recipW[0] = vRecipW2;
        recipW[1] = vRecipW0;
        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        AR_END(FEBinTriangles, 1);
        return;
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    DWORD triIndex = 0;

    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if (CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if (cullZeroAreaMask > 0)
        {
            // e0 degenerate when v0 == v1 (both x and y must match)
            const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
            const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);

            uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));

            // e1 degenerate when v1 == v2
            const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
            const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);

            uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));

            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // spread each per-lane edge bit into the 3-bits-per-triangle encoding
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);

            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);

            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // a set bit in edgeEnable means the edge is valid to rasterize
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simd16BBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
        (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        uint32_t cullCenterMask;

        {
            // snap both bbox edges to the nearest pixel-center grid; if they
            // land on the same value the tri covers no pixel center
            simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
            xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
            simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
            xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));

            simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);

            simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
            ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
            simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
            ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));

            simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);

            vMaskV = _simd16_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if (origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;

    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                                                         scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);

    if (CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);

        simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];

    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH

    vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
    vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
    vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
    vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));

    vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
    vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
    vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
    vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));

    // store render target array index
    OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simd16vector vRtai[3];
        pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
        simd16scalari vRtaii;
        vRtaii = _simd16_castps_si(vRtai[0].x);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
    }
    else
    {
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if (CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
                                             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        {
            // index into the two SIMD8 halves of the transposed vertex data
            const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
            const uint32_t j = triIndex & 7;  // triIndex % KNOB_SIMD_WIDTH

            _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
            _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
            _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
            _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
        }

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
        }

        // enqueue the triangle into every macrotile its bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    AR_END(FEBinTriangles, 1);
}

#endif
1460 struct FEBinTrianglesChooser
1462 typedef PFN_PROCESS_PRIMS FuncType
;
1464 template <typename
... ArgsB
>
1465 static FuncType
GetFunc()
1467 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
//////////////////////////////////////////////////////////////////////////
/// @brief Selector for the correctly templated BinTriangles function.
/// @param IsConservative - true when conservative rasterization is enabled;
///        selects the BinTriangles instantiation with matching
///        ConservativeRastFETraits.
/// @return Frontend triangle-binning function pointer.
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}
1477 #if USE_SIMD16_FRONTEND
1478 struct FEBinTrianglesChooser_simd16
1480 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType
;
1482 template <typename
... ArgsB
>
1483 static FuncType
GetFunc()
1485 return BinTriangles_simd16
<ConservativeRastFETraits
<ArgsB
...>>;
//////////////////////////////////////////////////////////////////////////
/// @brief Selector for the correctly templated BinTriangles_simd16 function.
/// @param IsConservative - true when conservative rasterization is enabled.
/// @return SIMD16 frontend triangle-binning function pointer.
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
}

#endif
1497 //////////////////////////////////////////////////////////////////////////
1498 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1499 /// @param pDC - pointer to draw context.
1500 /// @param pa - The primitive assembly object.
1501 /// @param workerId - thread's worker id. Even thread has a unique id.
1502 /// @param tri - Contains point position data for SIMDs worth of points.
1503 /// @param primID - Primitive ID for each point.
1511 simdscalari viewportIdx
)
1513 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1515 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1517 simdvector
& primVerts
= prim
[0];
1519 const API_STATE
& state
= GetApiState(pDC
);
1520 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1521 const SWR_GS_STATE
& gsState
= state
.gsState
;
1522 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1523 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1525 // Select attribute processor
1526 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1527 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1529 if (!feState
.vpTransformDisable
)
1531 // perspective divide
1532 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
1533 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
1534 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
1535 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
1537 // viewport transform to screen coords
1538 if (state
.gsState
.emitsViewportArrayIndex
)
1540 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
1544 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
1548 // adjust for pixel center location
1549 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1550 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
1551 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
1553 // convert to fixed point
1554 simdscalari vXi
, vYi
;
1555 vXi
= fpToFixedPointVertical(primVerts
.x
);
1556 vYi
= fpToFixedPointVertical(primVerts
.y
);
1558 if (CanUseSimplePoints(pDC
))
1560 // adjust for ymin-xmin rule
1561 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
1562 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
1564 // cull points off the ymin-xmin edge of the viewport
1565 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
1566 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
1568 // compute macro tile coordinates
1569 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1570 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1572 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
1573 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
1574 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
1576 // compute raster tile coordinates
1577 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1578 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1580 // compute raster tile relative x,y for coverage mask
1581 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1582 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1584 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1585 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1587 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
1588 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
1589 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
1590 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
1592 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
1593 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
1594 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
1595 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
1597 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
1598 _simd_store_ps((float*)aZ
, primVerts
.z
);
1600 // store render target array index
1601 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1602 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1605 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
1606 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
1607 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1611 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1614 uint32_t *pPrimID
= (uint32_t *)&primID
;
1615 DWORD primIndex
= 0;
1617 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1619 // scan remaining valid triangles and bin each separately
1620 while (_BitScanForward(&primIndex
, primMask
))
1622 uint32_t linkageCount
= backendState
.numAttributes
;
1623 uint32_t numScalarAttribs
= linkageCount
* 4;
1628 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1630 // points are always front facing
1631 desc
.triFlags
.frontFacing
= 1;
1632 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1633 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1634 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1636 work
.pfnWork
= RasterizeSimplePoint
;
1638 auto pArena
= pDC
->pArena
;
1639 SWR_ASSERT(pArena
!= nullptr);
1642 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1643 desc
.pAttribs
= pAttribs
;
1644 desc
.numAttribs
= linkageCount
;
1646 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1648 // store raster tile aligned x, y, perspective correct z
1649 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1650 desc
.pTriBuffer
= pTriBuffer
;
1651 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1652 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1653 *pTriBuffer
= aZ
[primIndex
];
1655 uint32_t tX
= aTileRelativeX
[primIndex
];
1656 uint32_t tY
= aTileRelativeY
[primIndex
];
1658 // pack the relative x,y into the coverageMask, the rasterizer will
1659 // generate the true coverage mask from it
1660 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1663 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1664 #if KNOB_ENABLE_TOSS_POINTS
1665 if (!KNOB_TOSS_SETUP_TRIS
)
1668 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1670 primMask
&= ~(1 << primIndex
);
1675 // non simple points need to be potentially binned to multiple macro tiles
1676 simdscalar vPointSize
;
1677 if (rastState
.pointParam
)
1680 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
1681 vPointSize
= size
[0].x
;
1685 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
1688 // bloat point to bbox
1690 bbox
.xmin
= bbox
.xmax
= vXi
;
1691 bbox
.ymin
= bbox
.ymax
= vYi
;
1693 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
1694 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1695 bbox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1696 bbox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
1697 bbox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1698 bbox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
1700 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1701 // Gather the AOS effective scissor rects based on the per-prim VP index.
1702 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1703 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1704 if (state
.gsState
.emitsViewportArrayIndex
)
1706 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1707 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1709 else // broadcast fast path for non-VPAI case.
1711 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1712 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1713 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1714 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1717 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
1718 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
1719 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
1720 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
1722 // Cull bloated points completely outside scissor
1723 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1724 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1725 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1726 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1727 primMask
= primMask
& ~maskOutsideScissor
;
1729 // Convert bbox to macrotile units.
1730 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1731 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1732 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1733 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1735 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1736 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
1737 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
1738 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
1739 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
1741 // store render target array index
1742 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1743 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1745 simdvector vRtai
[2];
1746 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
1747 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
1748 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1752 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1755 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
1756 _simd_store_ps((float*)aPointSize
, vPointSize
);
1758 uint32_t *pPrimID
= (uint32_t *)&primID
;
1760 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
1761 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
1762 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
1764 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
1765 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
1766 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
1768 // scan remaining valid prims and bin each separately
1769 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1771 while (_BitScanForward(&primIndex
, primMask
))
1773 uint32_t linkageCount
= backendState
.numAttributes
;
1774 uint32_t numScalarAttribs
= linkageCount
* 4;
1779 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1781 desc
.triFlags
.frontFacing
= 1;
1782 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1783 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1784 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1785 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1787 work
.pfnWork
= RasterizeTriPoint
;
1789 auto pArena
= pDC
->pArena
;
1790 SWR_ASSERT(pArena
!= nullptr);
1792 // store active attribs
1793 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1794 desc
.numAttribs
= linkageCount
;
1795 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1797 // store point vertex data
1798 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1799 desc
.pTriBuffer
= pTriBuffer
;
1800 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1801 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1802 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1804 // store user clip distances
1805 if (rastState
.clipDistanceMask
)
1807 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1808 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1811 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
1812 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1813 desc
.pUserClipBuffer
[3*i
+ 0] = 0.0f
;
1814 desc
.pUserClipBuffer
[3*i
+ 1] = 0.0f
;
1815 desc
.pUserClipBuffer
[3*i
+ 2] = dists
[i
];
1819 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1820 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1822 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1824 #if KNOB_ENABLE_TOSS_POINTS
1825 if (!KNOB_TOSS_SETUP_TRIS
)
1828 pTileMgr
->enqueue(x
, y
, &work
);
1833 primMask
&= ~(1 << primIndex
);
1837 AR_END(FEBinPoints
, 1);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD16 points to the backend.
///        Points that pass CanUseSimplePoints() are binned to the single
///        macrotile containing them; all other points are bloated by the
///        point size into a bounding box and binned to every macrotile
///        that box touches.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primMask - Mask of valid lanes (one bit per point).
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport Array Index for each point.
void SIMDAPI BinPoints_simd16(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simd16vector prim[3],
    uint32_t primMask,
    simd16scalari primID,
    simd16scalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinPoints, pDC->drawId);

    simd16vector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);

        primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // pixel-center / upper-left offset selected by raster state
    const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];

    primVerts.x = _simd16_add_ps(primVerts.x, offset);
    primVerts.y = _simd16_add_ps(primVerts.y, offset);

    // convert to fixed point
    simd16scalari vXi, vYi;

    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1));
        vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (negative coords set the sign bit, which movemask extracts)
        primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi));
        primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi));

        // compute macro tile coordinates
        simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH];

        _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroX), macroX);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroY), macroY);

        // compute raster tile coordinates
        simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH];
        OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH];

        _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeX), tileRelativeX);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeY), tileRelativeY);

        OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH];
        OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH];

        _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedX), tileAlignedX);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedY), tileAlignedY);

        OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH];
        _simd16_store_ps(reinterpret_cast<float *>(aZ), primVerts.z);

        // store render target array index
        OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simd16vector vRtai;
            pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai);
            simd16scalari vRtaii = _simd16_castps_si(vRtai.x);
            _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
        }
        else
        {
            _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid triangles and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it to the single macrotile containing the point
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }

            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simd16scalar vPointSize;

        if (rastState.pointParam)
        {
            // per-vertex point size
            simd16vector size[3];
            pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd16_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simd16BBox bbox;

        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f));
        simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);

        bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];

        _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);

        // store render target array index
        OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simd16vector vRtai[2];
            pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
            simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
            _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
        }
        else
        {
            _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
        }

        OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH];
        _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH];
        OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH];
        OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH];

        _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
        _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
        _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
                float dists[8];
                float one = 1.0f;
                ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
                for (uint32_t i = 0; i < numClipDist; i++)
                {
                    // points have a single clip distance; barycentric deltas are 0
                    desc.pUserClipBuffer[3 * i + 0] = 0.0f;
                    desc.pUserClipBuffer[3 * i + 1] = 0.0f;
                    desc.pUserClipBuffer[3 * i + 2] = dists[i];
                }
            }

            // bin the bloated point to every macrotile its bbox touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    AR_END(FEBinPoints, 1);
}
#endif
2188 //////////////////////////////////////////////////////////////////////////
2189 /// @brief Bin SIMD lines to the backend.
2190 /// @param pDC - pointer to draw context.
2191 /// @param pa - The primitive assembly object.
2192 /// @param workerId - thread's worker id. Every thread has a unique id.
2193 /// @param prim - Contains line position data for SIMDs worth of lines.
2194 /// @param primID - Primitive ID for each line.
2195 /// @param viewportIdx - Viewport Array Index for each line.
2196 void BinPostSetupLines(
2201 simdscalar recipW
[],
2204 simdscalari viewportIdx
)
2206 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2208 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2210 const API_STATE
& state
= GetApiState(pDC
);
2211 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2212 const SWR_GS_STATE
& gsState
= state
.gsState
;
2214 // Select attribute processor
2215 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2216 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2218 simdscalar
& vRecipW0
= recipW
[0];
2219 simdscalar
& vRecipW1
= recipW
[1];
2221 // convert to fixed point
2222 simdscalari vXi
[2], vYi
[2];
2223 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2224 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2225 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2226 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2228 // compute x-major vs y-major mask
2229 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2230 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2231 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2232 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2234 // cull zero-length lines
2235 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2236 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2238 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2240 uint32_t *pPrimID
= (uint32_t *)&primID
;
2241 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2243 simdscalar vUnused
= _simd_setzero_ps();
2245 // Calc bounding box of lines
2247 bbox
.xmin
= _simd_min_epi32(vXi
[0], vXi
[1]);
2248 bbox
.xmax
= _simd_max_epi32(vXi
[0], vXi
[1]);
2249 bbox
.ymin
= _simd_min_epi32(vYi
[0], vYi
[1]);
2250 bbox
.ymax
= _simd_max_epi32(vYi
[0], vYi
[1]);
2252 // bloat bbox by line width along minor axis
2253 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2254 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2256 bloatBox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2257 bloatBox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
2258 bloatBox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2259 bloatBox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
2261 bbox
.xmin
= _simd_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2262 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2263 bbox
.ymin
= _simd_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2264 bbox
.ymax
= _simd_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2266 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2267 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2268 if (state
.gsState
.emitsViewportArrayIndex
)
2270 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2271 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2273 else // broadcast fast path for non-VPAI case.
2275 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2276 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2277 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2278 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2281 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2282 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2283 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2284 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2286 // Cull prims completely outside scissor
2288 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2289 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2290 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2291 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2292 primMask
= primMask
& ~maskOutsideScissor
;
2300 // Convert triangle bbox to macrotile units.
2301 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2302 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2303 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2304 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2306 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2307 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2308 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2309 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2310 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2312 // transpose verts needed for backend
2313 /// @todo modify BE to take non-transformed verts
2314 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2315 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2316 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2317 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2318 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2320 // store render target array index
2321 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2322 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2324 simdvector vRtai
[2];
2325 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2326 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2327 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2331 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2334 // scan remaining valid prims and bin each separately
2336 while (_BitScanForward(&primIndex
, primMask
))
2338 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2339 uint32_t numScalarAttribs
= linkageCount
* 4;
2344 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2346 desc
.triFlags
.frontFacing
= 1;
2347 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2348 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2349 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2350 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2352 work
.pfnWork
= RasterizeLine
;
2354 auto pArena
= pDC
->pArena
;
2355 SWR_ASSERT(pArena
!= nullptr);
2357 // store active attribs
2358 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2359 desc
.numAttribs
= linkageCount
;
2360 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2362 // store line vertex data
2363 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2364 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2365 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2366 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2367 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2369 // store user clip distances
2370 if (rastState
.clipDistanceMask
)
2372 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2373 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2374 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2377 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2378 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2380 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2382 #if KNOB_ENABLE_TOSS_POINTS
2383 if (!KNOB_TOSS_SETUP_TRIS
)
2386 pTileMgr
->enqueue(x
, y
, &work
);
2391 primMask
&= ~(1 << primIndex
);
2396 AR_END(FEBinLines
, 1);
2399 #if USE_SIMD16_FRONTEND
2400 void BinPostSetupLines_simd16(
2404 simd16vector prim
[],
2405 simd16scalar recipW
[],
2407 simd16scalari primID
,
2408 simd16scalari viewportIdx
)
2410 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2412 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2414 const API_STATE
& state
= GetApiState(pDC
);
2415 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2416 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2417 const SWR_GS_STATE
& gsState
= state
.gsState
;
2419 // Select attribute processor
2420 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2421 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2423 simd16scalar
& vRecipW0
= recipW
[0];
2424 simd16scalar
& vRecipW1
= recipW
[1];
2426 // convert to fixed point
2427 simd16scalari vXi
[2], vYi
[2];
2429 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2430 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2431 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2432 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2434 // compute x-major vs y-major mask
2435 simd16scalari xLength
= _simd16_abs_epi32(_simd16_sub_epi32(vXi
[0], vXi
[1]));
2436 simd16scalari yLength
= _simd16_abs_epi32(_simd16_sub_epi32(vYi
[0], vYi
[1]));
2437 simd16scalar vYmajorMask
= _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength
, xLength
));
2438 uint32_t yMajorMask
= _simd16_movemask_ps(vYmajorMask
);
2440 // cull zero-length lines
2441 simd16scalari vZeroLengthMask
= _simd16_cmpeq_epi32(xLength
, _simd16_setzero_si());
2442 vZeroLengthMask
= _simd16_and_si(vZeroLengthMask
, _simd16_cmpeq_epi32(yLength
, _simd16_setzero_si()));
2444 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask
));
2446 uint32_t *pPrimID
= (uint32_t *)&primID
;
2447 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2449 // Calc bounding box of lines
2451 bbox
.xmin
= _simd16_min_epi32(vXi
[0], vXi
[1]);
2452 bbox
.xmax
= _simd16_max_epi32(vXi
[0], vXi
[1]);
2453 bbox
.ymin
= _simd16_min_epi32(vYi
[0], vYi
[1]);
2454 bbox
.ymax
= _simd16_max_epi32(vYi
[0], vYi
[1]);
2456 // bloat bbox by line width along minor axis
2457 simd16scalar vHalfWidth
= _simd16_set1_ps(rastState
.lineWidth
/ 2.0f
);
2458 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2460 simd16BBox bloatBox
;
2462 bloatBox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2463 bloatBox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
2464 bloatBox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2465 bloatBox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
2467 bbox
.xmin
= _simd16_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2468 bbox
.xmax
= _simd16_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2469 bbox
.ymin
= _simd16_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2470 bbox
.ymax
= _simd16_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2472 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2473 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2475 if (state
.gsState
.emitsViewportArrayIndex
)
2477 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2478 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2480 else // broadcast fast path for non-VPAI case.
2482 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2483 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2484 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2485 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2488 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
2489 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
2490 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
2491 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
2493 // Cull prims completely outside scissor
2495 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2496 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2497 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2498 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
2499 primMask
= primMask
& ~maskOutsideScissor
;
2502 const simdscalar unused
= _simd_setzero_ps();
2509 // Convert triangle bbox to macrotile units.
2510 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2511 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2512 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2513 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2515 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
2517 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
2518 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
2519 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
2520 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
2522 // transpose verts needed for backend
2523 /// @todo modify BE to take non-transformed verts
2524 __m128 vHorizX
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2525 __m128 vHorizY
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2526 __m128 vHorizZ
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2527 __m128 vHorizW
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2529 vTranspose3x8(vHorizX
[0], _simd16_extract_ps(prim
[0].x
, 0), _simd16_extract_ps(prim
[1].x
, 0), unused
);
2530 vTranspose3x8(vHorizY
[0], _simd16_extract_ps(prim
[0].y
, 0), _simd16_extract_ps(prim
[1].y
, 0), unused
);
2531 vTranspose3x8(vHorizZ
[0], _simd16_extract_ps(prim
[0].z
, 0), _simd16_extract_ps(prim
[1].z
, 0), unused
);
2532 vTranspose3x8(vHorizW
[0], _simd16_extract_ps(vRecipW0
, 0), _simd16_extract_ps(vRecipW1
, 0), unused
);
2534 vTranspose3x8(vHorizX
[1], _simd16_extract_ps(prim
[0].x
, 1), _simd16_extract_ps(prim
[1].x
, 1), unused
);
2535 vTranspose3x8(vHorizY
[1], _simd16_extract_ps(prim
[0].y
, 1), _simd16_extract_ps(prim
[1].y
, 1), unused
);
2536 vTranspose3x8(vHorizZ
[1], _simd16_extract_ps(prim
[0].z
, 1), _simd16_extract_ps(prim
[1].z
, 1), unused
);
2537 vTranspose3x8(vHorizW
[1], _simd16_extract_ps(vRecipW0
, 1), _simd16_extract_ps(vRecipW1
, 1), unused
);
2539 // store render target array index
2540 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
2541 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2543 simd16vector vRtai
[2];
2544 pa
.Assemble_simd16(VERTEX_RTAI_SLOT
, vRtai
);
2545 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0].x
);
2546 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
2550 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
2553 // scan remaining valid prims and bin each separately
2555 while (_BitScanForward(&primIndex
, primMask
))
2557 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2558 uint32_t numScalarAttribs
= linkageCount
* 4;
2563 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2565 desc
.triFlags
.frontFacing
= 1;
2566 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2567 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2568 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2569 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2571 work
.pfnWork
= RasterizeLine
;
2573 auto pArena
= pDC
->pArena
;
2574 SWR_ASSERT(pArena
!= nullptr);
2576 // store active attribs
2577 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2578 desc
.numAttribs
= linkageCount
;
2579 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2581 // store line vertex data
2582 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2585 const uint32_t i
= primIndex
>> 3; // triIndex / KNOB_SIMD_WIDTH
2586 const uint32_t j
= primIndex
& 7; // triIndex % KNOB_SIMD_WIDTH
2588 _mm_store_ps(&desc
.pTriBuffer
[ 0], vHorizX
[i
][j
]);
2589 _mm_store_ps(&desc
.pTriBuffer
[ 4], vHorizY
[i
][j
]);
2590 _mm_store_ps(&desc
.pTriBuffer
[ 8], vHorizZ
[i
][j
]);
2591 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[i
][j
]);
2594 // store user clip distances
2595 if (rastState
.clipDistanceMask
)
2597 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2598 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2599 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2602 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2603 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2605 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2607 #if KNOB_ENABLE_TOSS_POINTS
2608 if (!KNOB_TOSS_SETUP_TRIS
)
2611 pTileMgr
->enqueue(x
, y
, &work
);
2616 primMask
&= ~(1 << primIndex
);
2621 AR_END(FEBinLines
, 1);
2625 //////////////////////////////////////////////////////////////////////////
2626 /// @brief Bin SIMD lines to the backend.
2627 /// @param pDC - pointer to draw context.
2628 /// @param pa - The primitive assembly object.
2629 /// @param workerId - thread's worker id. Each thread has a unique id.
2630 /// @param prim - Contains line position data for SIMDs worth of lines.
2631 /// @param primID - Primitive ID for each line.
2632 /// @param viewportIdx - Viewport Array Index for each line.
2640 simdscalari viewportIdx
)
2642 const API_STATE
& state
= GetApiState(pDC
);
2643 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2644 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2646 simdscalar vRecipW
[2] = { _simd_set1_ps(1.0f
), _simd_set1_ps(1.0f
) };
2648 if (!feState
.vpTransformDisable
)
2650 // perspective divide
2651 vRecipW
[0] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2652 vRecipW
[1] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
2654 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2655 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2657 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2658 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2660 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2661 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2663 // viewport transform to screen coords
2664 if (state
.gsState
.emitsViewportArrayIndex
)
2666 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2670 viewportTransform
<2>(prim
, state
.vpMatrices
);
2674 // adjust for pixel center location
2675 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2676 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2677 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2679 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2680 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2693 #if USE_SIMD16_FRONTEND
2694 void SIMDAPI
BinLines_simd16(
2698 simd16vector prim
[3],
2700 simd16scalari primID
,
2701 simd16scalari viewportIdx
)
2703 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2705 const API_STATE
& state
= GetApiState(pDC
);
2706 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2707 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2708 const SWR_GS_STATE
& gsState
= state
.gsState
;
2710 // Select attribute processor
2711 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2712 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2714 simd16scalar vRecipW
[2] = { _simd16_set1_ps(1.0f
), _simd16_set1_ps(1.0f
) };
2716 if (!feState
.vpTransformDisable
)
2718 // perspective divide
2719 vRecipW
[0] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[0].w
);
2720 vRecipW
[1] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[1].w
);
2722 prim
[0].v
[0] = _simd16_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2723 prim
[1].v
[0] = _simd16_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2725 prim
[0].v
[1] = _simd16_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2726 prim
[1].v
[1] = _simd16_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2728 prim
[0].v
[2] = _simd16_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2729 prim
[1].v
[2] = _simd16_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2731 // viewport transform to screen coords
2732 if (state
.gsState
.emitsViewportArrayIndex
)
2734 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2738 viewportTransform
<2>(prim
, state
.vpMatrices
);
2742 // adjust for pixel center location
2743 simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2745 prim
[0].x
= _simd16_add_ps(prim
[0].x
, offset
);
2746 prim
[0].y
= _simd16_add_ps(prim
[0].y
, offset
);
2748 prim
[1].x
= _simd16_add_ps(prim
[1].x
, offset
);
2749 prim
[1].y
= _simd16_add_ps(prim
[1].y
, offset
);
2751 BinPostSetupLines_simd16(