1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
31 #include "conservativeRast.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
38 void BinPostSetupLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], simdscalar vRecipW
[2], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
40 //////////////////////////////////////////////////////////////////////////
41 /// @brief Offsets added to post-viewport vertex positions based on
43 static const simdscalar g_pixelOffsets
[SWR_PIXEL_LOCATION_UL
+ 1] =
45 _simd_set1_ps(0.0f
), // SWR_PIXEL_LOCATION_CENTER
46 _simd_set1_ps(0.5f
), // SWR_PIXEL_LOCATION_UL
49 //////////////////////////////////////////////////////////////////////////
50 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
51 /// Point precision from FP32.
52 template <typename PT
= FixedPointTraits
<Fixed_16_8
>>
53 INLINE simdscalari
fpToFixedPointVertical(const simdscalar vIn
)
55 simdscalar vFixed
= _simd_mul_ps(vIn
, _simd_set1_ps(PT::ScaleT::value
));
56 return _simd_cvtps_epi32(vFixed
);
59 //////////////////////////////////////////////////////////////////////////
60 /// @brief Helper function to set the X,Y coords of a triangle to the
61 /// requested Fixed Point precision from FP32.
62 /// @param tri: simdvector[3] of FP triangle verts
63 /// @param vXi: fixed point X coords of tri verts
64 /// @param vYi: fixed point Y coords of tri verts
65 INLINE
static void FPToFixedPoint(const simdvector
* const tri
, simdscalari(&vXi
)[3], simdscalari(&vYi
)[3])
67 vXi
[0] = fpToFixedPointVertical(tri
[0].x
);
68 vYi
[0] = fpToFixedPointVertical(tri
[0].y
);
69 vXi
[1] = fpToFixedPointVertical(tri
[1].x
);
70 vYi
[1] = fpToFixedPointVertical(tri
[1].y
);
71 vXi
[2] = fpToFixedPointVertical(tri
[2].x
);
72 vYi
[2] = fpToFixedPointVertical(tri
[2].y
);
75 //////////////////////////////////////////////////////////////////////////
76 /// @brief Calculate bounding box for current triangle
77 /// @tparam CT: ConservativeRastFETraits type
78 /// @param vX: fixed point X position for triangle verts
79 /// @param vY: fixed point Y position for triangle verts
80 /// @param bbox: fixed point bbox
81 /// *Note*: expects vX, vY to be in the correct precision for the type
82 /// of rasterization. This avoids unnecessary FP->fixed conversions.
83 template <typename CT
>
84 INLINE
void calcBoundingBoxIntVertical(const simdvector
* const tri
, simdscalari(&vX
)[3], simdscalari(&vY
)[3], simdBBox
&bbox
)
86 simdscalari vMinX
= vX
[0];
87 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
88 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
90 simdscalari vMaxX
= vX
[0];
91 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
92 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
94 simdscalari vMinY
= vY
[0];
95 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
96 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
98 simdscalari vMaxY
= vY
[0];
99 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
100 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
110 /// Offsets BBox for conservative rast
112 INLINE
void calcBoundingBoxIntVertical
<FEConservativeRastT
>(const simdvector
* const tri
, simdscalari(&vX
)[3], simdscalari(&vY
)[3], simdBBox
&bbox
)
114 // FE conservative rast traits
115 typedef FEConservativeRastT CT
;
117 simdscalari vMinX
= vX
[0];
118 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
119 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
121 simdscalari vMaxX
= vX
[0];
122 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
123 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
125 simdscalari vMinY
= vY
[0];
126 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
127 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
129 simdscalari vMaxY
= vY
[0];
130 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
131 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
133 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
134 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
135 bbox
.xmin
= _simd_sub_epi32(vMinX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
136 bbox
.xmax
= _simd_add_epi32(vMaxX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
137 bbox
.ymin
= _simd_sub_epi32(vMinY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
138 bbox
.ymax
= _simd_add_epi32(vMaxY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
141 //////////////////////////////////////////////////////////////////////////
142 /// @brief Processes attributes for the backend based on linkage mask and
143 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
144 /// @param pDC - Draw context
145 /// @param pa - Primitive Assembly state
146 /// @param linkageMask - Specifies which VS outputs are routed to PS.
147 /// @param pLinkageMap - maps VS attribute slot to PS slot
148 /// @param triIndex - Triangle to process attributes for
149 /// @param pBuffer - Output result
/// @param primId - primitive ID; its bits are reinterpreted as a float when a
///                 swizzled attribute selects SWR_CONSTANT_SOURCE_PRIM_ID
/// @tparam NumVertsT - vertices per primitive (the static_assert restricts it to 1..3)
/// @tparam IsSwizzledT - when set, input slots and constant overrides are taken
///                       from backendState.swizzleMap
/// @tparam HasConstantInterpT - when set, attributes flagged in the constant
///                       interpolation mask store the provoking vertex's value
///                       for every vertex
/// @tparam IsDegenerate - when set, the constant interpolation mask is forced
///                       to all ones
/// NOTE(review): linkageMask / pLinkageMap are not referenced in the visible
/// body; those two @param entries look stale - confirm against the signature.
150 template<typename NumVertsT
, typename IsSwizzledT
, typename HasConstantInterpT
, typename IsDegenerate
>
151 INLINE
void ProcessAttributes(
158 static_assert(NumVertsT::value
> 0 && NumVertsT::value
<= 3, "Invalid value for NumVertsT");
159 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
160 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
161 LONG constantInterpMask
= IsDegenerate::value
? 0xFFFFFFFF : backendState
.constantInterpolationMask
;
162 const uint32_t provokingVertex
= pDC
->pState
->state
.frontendState
.topologyProvokingVertex
;
163 const PRIMITIVE_TOPOLOGY topo
= pDC
->pState
->state
.topology
;
// Constant override values: the row is selected by the swizzle entry's
// constantSource, the column by the component being overridden.
165 static const float constTable
[3][4] = {
166 { 0.0f
, 0.0f
, 0.0f
, 0.0f
},
167 { 0.0f
, 0.0f
, 0.0f
, 1.0f
},
168 { 1.0f
, 1.0f
, 1.0f
, 1.0f
}
171 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
// Swizzled draws read the source slot from the swizzle map; otherwise
// attribute i maps straight onto slot VERTEX_ATTRIB_START_SLOT + i.
174 if (IsSwizzledT::value
)
176 SWR_ATTRIB_SWIZZLE attribSwizzle
= backendState
.swizzleMap
[i
];
177 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ attribSwizzle
.sourceAttrib
;
182 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ i
;
185 __m128 attrib
[3]; // triangle attribs (always 4 wide)
// Remember where this attribute starts so constant overrides can patch it later.
186 float* pAttribStart
= pBuffer
;
188 if (HasConstantInterpT::value
|| IsDegenerate::value
)
190 if (_bittest(&constantInterpMask
, i
))
193 uint32_t adjustedTriIndex
;
// Lookup tables used to locate the provoking vertex. The 2-D tables are
// indexed by [triIndex & 1][provokingVertex]; the *ProvokingTri tables
// hold an offset that is added to triIndex.
194 static const uint32_t tristripProvokingVertex
[] = { 0, 2, 1 };
195 static const int32_t quadProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
196 static const uint32_t quadProvokingVertex
[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
197 static const int32_t qstripProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
198 static const uint32_t qstripProvokingVertex
[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
202 adjustedTriIndex
= triIndex
+ quadProvokingTri
[triIndex
& 1][provokingVertex
];
203 vid
= quadProvokingVertex
[triIndex
& 1][provokingVertex
];
206 adjustedTriIndex
= triIndex
+ qstripProvokingTri
[triIndex
& 1][provokingVertex
];
207 vid
= qstripProvokingVertex
[triIndex
& 1][provokingVertex
];
209 case TOP_TRIANGLE_STRIP
:
210 adjustedTriIndex
= triIndex
;
212 ? tristripProvokingVertex
[provokingVertex
]
216 adjustedTriIndex
= triIndex
;
217 vid
= provokingVertex
;
221 pa
.AssembleSingle(inputSlot
, adjustedTriIndex
, attrib
);
// Constant interpolation: every vertex receives the provoking vertex's value.
223 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
225 _mm_store_ps(pBuffer
, attrib
[vid
]);
231 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
233 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
235 _mm_store_ps(pBuffer
, attrib
[i
]);
242 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
244 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
246 _mm_store_ps(pBuffer
, attrib
[i
]);
251 // pad out the attrib buffer to 3 verts to ensure the triangle
252 // interpolation code in the pixel shader works correctly for the
253 // 3 topologies - point, line, tri. This effectively zeros out the
254 // effect of the missing vertices in the triangle interpolation.
255 for (uint32_t v
= NumVertsT::value
; v
< 3; ++v
)
257 _mm_store_ps(pBuffer
, attrib
[NumVertsT::value
- 1]);
261 // check for constant source overrides
262 if (IsSwizzledT::value
)
264 uint32_t mask
= backendState
.swizzleMap
[i
].componentOverrideMask
;
268 while (_BitScanForward(&comp
, mask
))
// Clear the component just found so the scan advances to the next set bit.
270 mask
&= ~(1 << comp
);
272 float constantValue
= 0.0f
;
273 switch ((SWR_CONSTANT_SOURCE
)backendState
.swizzleMap
[i
].constantSource
)
275 case SWR_CONSTANT_SOURCE_CONST_0000
:
276 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT
:
277 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT
:
278 constantValue
= constTable
[backendState
.swizzleMap
[i
].constantSource
][comp
];
280 case SWR_CONSTANT_SOURCE_PRIM_ID
:
// The primitive ID bits are passed through unchanged, viewed as a float.
281 constantValue
= *(float*)&primId
;
285 // apply constant value to all 3 vertices
286 for (uint32_t v
= 0; v
< 3; ++v
)
288 pAttribStart
[comp
+ v
* 4] = constantValue
;
296 //////////////////////////////////////////////////////////////////////////
297 /// @brief Gather scissor rect data based on per-prim viewport indices.
298 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
299 /// @param pViewportIndex - array of per-primitive vewport indexes.
300 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
301 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
302 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
303 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
305 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
306 template<size_t SimdWidth
>
307 struct GatherScissors
309 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
310 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
311 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
313 SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
318 struct GatherScissors
<8>
320 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
321 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
322 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
324 scisXmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
325 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
326 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
327 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
328 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
329 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
330 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
331 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
332 scisYmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
333 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
334 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
335 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
336 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
337 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
338 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
339 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
340 scisXmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
341 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
342 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
343 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
344 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
345 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
346 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
347 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
348 scisYmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
349 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
350 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
351 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
352 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
353 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
354 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
355 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
359 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
361 struct ProcessAttributesChooser
363 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
365 template <typename
... ArgsB
>
366 static FuncType
GetFunc()
368 return ProcessAttributes
<ArgsB
...>;
372 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
374 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
377 //////////////////////////////////////////////////////////////////////////
378 /// @brief Processes enabled user clip distances. Loads the active clip
379 /// distances from the PA, sets up barycentric equations, and
380 /// stores the results to the output buffer
381 /// @param pa - Primitive Assembly state
382 /// @param primIndex - primitive index to process
383 /// @param clipDistMask - mask of enabled clip distances
384 /// @param pUserClipBuffer - buffer to store results
385 template<uint32_t NumVerts
>
386 void ProcessUserClipDist(PA_STATE
& pa
, uint32_t primIndex
, uint8_t clipDistMask
, float *pRecipW
, float* pUserClipBuffer
)
389 while (_BitScanForward(&clipDist
, clipDistMask
))
391 clipDistMask
&= ~(1 << clipDist
);
392 uint32_t clipSlot
= clipDist
>> 2;
393 uint32_t clipComp
= clipDist
& 0x3;
394 uint32_t clipAttribSlot
= clipSlot
== 0 ?
395 VERTEX_CLIPCULL_DIST_LO_SLOT
: VERTEX_CLIPCULL_DIST_HI_SLOT
;
397 __m128 primClipDist
[3];
398 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
400 float vertClipDist
[NumVerts
];
401 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
403 OSALIGNSIMD(float) aVertClipDist
[4];
404 _mm_store_ps(aVertClipDist
, primClipDist
[e
]);
405 vertClipDist
[e
] = aVertClipDist
[clipComp
];
408 // setup plane equations for barycentric interpolation in the backend
409 float baryCoeff
[NumVerts
];
410 float last
= vertClipDist
[NumVerts
- 1] * pRecipW
[NumVerts
- 1];
411 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
413 baryCoeff
[e
] = vertClipDist
[e
] * pRecipW
[e
] - last
;
415 baryCoeff
[NumVerts
- 1] = last
;
417 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
419 *(pUserClipBuffer
++) = baryCoeff
[e
];
424 //////////////////////////////////////////////////////////////////////////
425 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
426 /// culling, viewport transform, etc.
427 /// @param pDC - pointer to draw context.
428 /// @param pa - The primitive assembly object.
429 /// @param workerId - thread's worker id. Every thread has a unique id.
430 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
431 /// @param primID - Primitive ID for each triangle.
432 /// @param viewportIdx - viewport array index for each triangle.
433 /// @tparam CT - ConservativeRastFETraits
///
/// Stages, in the order they appear below: reciprocal-W / perspective divide
/// and viewport transform, pixel-location offset, conversion to fixed point,
/// determinant-based zero-area and winding culling, optional wireframe path
/// through BinPostSetupLines, bounding box calculation, between-pixel-centers
/// culling, scissor intersection, conversion of the box to macrotile units,
/// and finally one work item per surviving triangle enqueued into every
/// macrotile its box overlaps.
/// NOTE(review): triMask is the live per-lane triangle mask throughout; its
/// declaration is not visible in this excerpt.
434 template <typename CT
>
442 simdscalari viewportIdx
)
444 SWR_CONTEXT
*pContext
= pDC
->pContext
;
446 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
448 const API_STATE
& state
= GetApiState(pDC
);
449 const SWR_RASTSTATE
& rastState
= state
.rastState
;
450 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
451 const SWR_GS_STATE
& gsState
= state
.gsState
;
452 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
454 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
455 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
456 simdscalar vRecipW2
= _simd_set1_ps(1.0f
);
458 if (feState
.vpTransformDisable
)
460 // RHW is passed in directly when VP transform is disabled
461 vRecipW0
= tri
[0].v
[3];
462 vRecipW1
= tri
[1].v
[3];
463 vRecipW2
= tri
[2].v
[3];
467 // Perspective divide
468 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[0].w
);
469 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[1].w
);
470 vRecipW2
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[2].w
);
472 tri
[0].v
[0] = _simd_mul_ps(tri
[0].v
[0], vRecipW0
);
473 tri
[1].v
[0] = _simd_mul_ps(tri
[1].v
[0], vRecipW1
);
474 tri
[2].v
[0] = _simd_mul_ps(tri
[2].v
[0], vRecipW2
);
476 tri
[0].v
[1] = _simd_mul_ps(tri
[0].v
[1], vRecipW0
);
477 tri
[1].v
[1] = _simd_mul_ps(tri
[1].v
[1], vRecipW1
);
478 tri
[2].v
[1] = _simd_mul_ps(tri
[2].v
[1], vRecipW2
);
480 tri
[0].v
[2] = _simd_mul_ps(tri
[0].v
[2], vRecipW0
);
481 tri
[1].v
[2] = _simd_mul_ps(tri
[1].v
[2], vRecipW1
);
482 tri
[2].v
[2] = _simd_mul_ps(tri
[2].v
[2], vRecipW2
);
484 // Viewport transform to screen space coords
485 if (state
.gsState
.emitsViewportArrayIndex
)
487 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
491 viewportTransform
<3>(tri
, state
.vpMatrices
);
495 // Adjust for pixel center location
496 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
497 tri
[0].x
= _simd_add_ps(tri
[0].x
, offset
);
498 tri
[0].y
= _simd_add_ps(tri
[0].y
, offset
);
500 tri
[1].x
= _simd_add_ps(tri
[1].x
, offset
);
501 tri
[1].y
= _simd_add_ps(tri
[1].y
, offset
);
503 tri
[2].x
= _simd_add_ps(tri
[2].x
, offset
);
504 tri
[2].y
= _simd_add_ps(tri
[2].y
, offset
);
506 simdscalari vXi
[3], vYi
[3];
507 // Set vXi, vYi to required fixed point precision
508 FPToFixedPoint(tri
, vXi
, vYi
);
511 simdscalari vAi
[3], vBi
[3];
512 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
516 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
// The determinants are compared as 64-bit lanes held in two registers
// (vDet[0], vDet[1]). movemask_pd yields one bit per 64-bit lane, so the
// second register's bits are shifted up by KNOB_SIMD_WIDTH / 2 to build a
// single per-triangle mask.
519 int maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[0], _simd_setzero_si())));
520 int maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[1], _simd_setzero_si())));
522 int cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
524 uint32_t origTriMask
= triMask
;
525 // don't cull degenerate triangles if we're conservatively rasterizing
526 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
528 triMask
&= ~cullZeroAreaMask
;
531 // determine front winding tris
534 // 0 area triangles are marked as backfacing regardless of winding order,
535 // which is required behavior for conservative rast and wireframe rendering
536 uint32_t frontWindingTris
;
537 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
539 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[0], _simd_setzero_si())));
540 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[1], _simd_setzero_si())));
544 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[0])));
545 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[1])));
547 frontWindingTris
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
551 switch ((SWR_CULLMODE
)rastState
.cullMode
)
553 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
554 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
555 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
556 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
557 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
558 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
561 triMask
&= ~cullTris
;
// Report how many triangles the zero-area and cull-mode tests removed.
563 if (origTriMask
^ triMask
)
565 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
568 // Simple non-conformant wireframe mode, useful for debugging
569 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
571 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
573 simdscalar recipW
[2];
576 recipW
[0] = vRecipW0
;
577 recipW
[1] = vRecipW1
;
578 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
582 recipW
[0] = vRecipW1
;
583 recipW
[1] = vRecipW2
;
584 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
588 recipW
[0] = vRecipW2
;
589 recipW
[1] = vRecipW0
;
590 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
592 AR_END(FEBinTriangles
, 1);
596 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
597 // compute per tri backface
598 uint32_t frontFaceMask
= frontWindingTris
;
// The SIMD registers are viewed as arrays of 32-bit lanes so they can be
// indexed by triIndex in the per-triangle loop below.
599 uint32_t *pPrimID
= (uint32_t *)&primID
;
600 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
602 // for center sample pattern, all samples are at pixel center; calculate coverage
603 // once at center and broadcast the results in the backend
604 const SWR_MULTISAMPLE_COUNT sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
606 PFN_WORK_FUNC pfnWork
;
607 if (CT::IsConservativeT::value
)
609 // determine which edges of the degenerate tri, if any, are valid to rasterize.
610 // used to call the appropriate templated rasterizer function
611 if (cullZeroAreaMask
> 0)
614 simdscalari x0x1Mask
= _simd_cmpeq_epi32(vXi
[0], vXi
[1]);
615 simdscalari y0y1Mask
= _simd_cmpeq_epi32(vYi
[0], vYi
[1]);
616 uint32_t e0Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask
, y0y1Mask
)));
619 simdscalari x1x2Mask
= _simd_cmpeq_epi32(vXi
[1], vXi
[2]);
620 simdscalari y1y2Mask
= _simd_cmpeq_epi32(vYi
[1], vYi
[2]);
621 uint32_t e1Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask
, y1y2Mask
)));
624 // if v0 == v1 & v1 == v2, v0 == v2
625 uint32_t e2Mask
= e0Mask
& e1Mask
;
626 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
// pdep spreads the 8 per-triangle bits so that each triangle owns three
// consecutive bits (e0, e1, e2) in the 24-bit edgeEnable word.
628 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
629 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
630 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
631 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
632 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
633 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
634 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
636 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
640 edgeEnable
= 0x00FFFFFF;
645 // degenerate triangles won't be sent to rasterizer; just enable all edges
646 pfnWork
= GetRasterizerFunc(sampleCount
, (rastState
.conservativeRast
> 0),
647 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, ALL_EDGES_VALID
,
648 (state
.scissorsTileAligned
== false));
653 goto endBinTriangles
;
656 // Calc bounding box of triangles
658 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
660 // determine if triangle falls between pixel centers and discard
661 // only discard for non-MSAA case and when conservative rast is disabled
662 // (xmin + 127) & ~255
663 // (xmax + 128) & ~255
664 if (rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&& (!CT::IsConservativeT::value
))
666 origTriMask
= triMask
;
670 simdscalari xmin
= _simd_add_epi32(bbox
.xmin
, _simd_set1_epi32(127));
671 xmin
= _simd_and_si(xmin
, _simd_set1_epi32(~255));
672 simdscalari xmax
= _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(128));
673 xmax
= _simd_and_si(xmax
, _simd_set1_epi32(~255));
675 simdscalari vMaskH
= _simd_cmpeq_epi32(xmin
, xmax
);
677 simdscalari ymin
= _simd_add_epi32(bbox
.ymin
, _simd_set1_epi32(127));
678 ymin
= _simd_and_si(ymin
, _simd_set1_epi32(~255));
679 simdscalari ymax
= _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(128));
680 ymax
= _simd_and_si(ymax
, _simd_set1_epi32(~255));
682 simdscalari vMaskV
= _simd_cmpeq_epi32(ymin
, ymax
);
683 vMaskV
= _simd_or_si(vMaskH
, vMaskV
);
684 cullCenterMask
= _simd_movemask_ps(_simd_castsi_ps(vMaskV
));
687 triMask
&= ~cullCenterMask
;
689 if (origTriMask
^ triMask
)
691 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
695 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
696 // Gather the AOS effective scissor rects based on the per-prim VP index.
697 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
698 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
699 if (state
.gsState
.emitsViewportArrayIndex
)
701 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
702 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
704 else // broadcast fast path for non-VPAI case.
706 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
707 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
708 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
709 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
712 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
713 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
714 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
715 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
717 if (CT::IsConservativeT::value
)
719 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
720 // some area. Bump the xmax/ymax edges out
721 simdscalari topEqualsBottom
= _simd_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
722 bbox
.ymax
= _simd_blendv_epi32(bbox
.ymax
, _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(1)), topEqualsBottom
);
723 simdscalari leftEqualsRight
= _simd_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
724 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(1)), leftEqualsRight
);
727 // Cull tris completely outside scissor
// After the intersection above, min > max on either axis means the box is empty.
729 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
730 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
731 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
732 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
733 triMask
= triMask
& ~maskOutsideScissor
;
738 goto endBinTriangles
;
741 // Convert triangle bbox to macrotile units.
742 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
743 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
744 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
745 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
// Spill the per-lane macrotile extents to arrays so they can be read per triangle.
747 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
748 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
749 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
750 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
751 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
753 // transpose verts needed for backend
754 /// @todo modify BE to take non-transformed verts
755 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
756 vTranspose3x8(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
757 vTranspose3x8(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
758 vTranspose3x8(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
759 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
761 // store render target array index
762 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
763 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
766 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
768 vRtaii
= _simd_castps_si(vRtai
[0].x
);
769 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
773 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
778 // scan remaining valid triangles and bin each separately
779 while (_BitScanForward(&triIndex
, triMask
))
781 uint32_t linkageCount
= state
.backendState
.numAttributes
;
782 uint32_t numScalarAttribs
= linkageCount
* 4;
788 if (CT::IsConservativeT::value
)
790 // only rasterize valid edges if we have a degenerate primitive
// edgeEnable holds three edge bits per triangle, hence the shift by triIndex * 3.
791 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
792 work
.pfnWork
= GetRasterizerFunc(sampleCount
, (rastState
.conservativeRast
> 0),
793 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, triEdgeEnable
,
794 (state
.scissorsTileAligned
== false));
796 // Degenerate triangles are required to be constant interpolated
797 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
801 isDegenerate
= false;
802 work
.pfnWork
= pfnWork
;
805 // Select attribute processor
806 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
807 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
809 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
811 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
812 desc
.triFlags
.primID
= pPrimID
[triIndex
];
813 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
814 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
816 auto pArena
= pDC
->pArena
;
817 SWR_ASSERT(pArena
!= nullptr);
819 // store active attribs
// numScalarAttribs floats per vertex, three vertices per triangle.
820 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
821 desc
.pAttribs
= pAttribs
;
822 desc
.numAttribs
= linkageCount
;
823 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
825 // store triangle vertex data
// Layout: X at [0], Y at [4], Z at [8], reciprocal W at [12].
826 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
828 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
829 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
830 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
831 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
833 // store user clip distances
834 if (rastState
.clipDistanceMask
)
836 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
837 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
838 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
// Enqueue the work item into every macrotile inside the inclusive
// [aMTLeft..aMTRight] x [aMTTop..aMTBottom] range for this triangle.
841 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
843 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
845 #if KNOB_ENABLE_TOSS_POINTS
846 if (!KNOB_TOSS_SETUP_TRIS
)
849 pTileMgr
->enqueue(x
, y
, &work
);
// Retire this triangle so the bit scan moves on to the next one.
853 triMask
&= ~(1 << triIndex
);
856 AR_END(FEBinTriangles
, 1);
859 struct FEBinTrianglesChooser
861 typedef PFN_PROCESS_PRIMS FuncType
;
863 template <typename
... ArgsB
>
864 static FuncType
GetFunc()
866 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
870 // Selector for correct templated BinTrinagles function
871 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
873 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
877 //////////////////////////////////////////////////////////////////////////
878 /// @brief Bin SIMD points to the backend. Only supports point size of 1
879 /// @param pDC - pointer to draw context.
880 /// @param pa - The primitive assembly object.
881 /// @param workerId - thread's worker id. Every thread has a unique id.
882 /// @param tri - Contains point position data for SIMDs worth of points.
883 /// @param primID - Primitive ID for each point.
891 simdscalari viewportIdx
)
893 SWR_CONTEXT
*pContext
= pDC
->pContext
;
895 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
897 simdvector
& primVerts
= prim
[0];
899 const API_STATE
& state
= GetApiState(pDC
);
900 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
901 const SWR_GS_STATE
& gsState
= state
.gsState
;
902 const SWR_RASTSTATE
& rastState
= state
.rastState
;
903 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
905 // Select attribute processor
906 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
907 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
909 if (!feState
.vpTransformDisable
)
911 // perspective divide
912 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
913 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
914 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
915 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
917 // viewport transform to screen coords
918 if (state
.gsState
.emitsViewportArrayIndex
)
920 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
924 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
928 // adjust for pixel center location
929 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
930 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
931 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
933 // convert to fixed point
934 simdscalari vXi
, vYi
;
935 vXi
= fpToFixedPointVertical(primVerts
.x
);
936 vYi
= fpToFixedPointVertical(primVerts
.y
);
938 if (CanUseSimplePoints(pDC
))
940 // adjust for ymin-xmin rule
941 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
942 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
944 // cull points off the ymin-xmin edge of the viewport
945 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
946 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
948 // compute macro tile coordinates
949 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
950 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
952 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
953 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
954 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
956 // compute raster tile coordinates
957 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
958 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
960 // compute raster tile relative x,y for coverage mask
961 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
962 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
964 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
965 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
967 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
968 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
969 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
970 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
972 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
973 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
974 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
975 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
977 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
978 _simd_store_ps((float*)aZ
, primVerts
.z
);
980 // store render target array index
981 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
982 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
985 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
986 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
987 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
991 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
994 uint32_t *pPrimID
= (uint32_t *)&primID
;
997 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
999 // scan remaining valid triangles and bin each separately
1000 while (_BitScanForward(&primIndex
, primMask
))
1002 uint32_t linkageCount
= backendState
.numAttributes
;
1003 uint32_t numScalarAttribs
= linkageCount
* 4;
1008 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1010 // points are always front facing
1011 desc
.triFlags
.frontFacing
= 1;
1012 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1013 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1014 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1016 work
.pfnWork
= RasterizeSimplePoint
;
1018 auto pArena
= pDC
->pArena
;
1019 SWR_ASSERT(pArena
!= nullptr);
1022 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1023 desc
.pAttribs
= pAttribs
;
1024 desc
.numAttribs
= linkageCount
;
1026 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1028 // store raster tile aligned x, y, perspective correct z
1029 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1030 desc
.pTriBuffer
= pTriBuffer
;
1031 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1032 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1033 *pTriBuffer
= aZ
[primIndex
];
1035 uint32_t tX
= aTileRelativeX
[primIndex
];
1036 uint32_t tY
= aTileRelativeY
[primIndex
];
1038 // pack the relative x,y into the coverageMask, the rasterizer will
1039 // generate the true coverage mask from it
1040 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1043 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1044 #if KNOB_ENABLE_TOSS_POINTS
1045 if (!KNOB_TOSS_SETUP_TRIS
)
1048 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1050 primMask
&= ~(1 << primIndex
);
1055 // non simple points need to be potentially binned to multiple macro tiles
1056 simdscalar vPointSize
;
1057 if (rastState
.pointParam
)
1060 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
1061 vPointSize
= size
[0].x
;
1065 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
1068 // bloat point to bbox
1070 bbox
.xmin
= bbox
.xmax
= vXi
;
1071 bbox
.ymin
= bbox
.ymax
= vYi
;
1073 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
1074 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1075 bbox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1076 bbox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
1077 bbox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1078 bbox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
1080 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1081 // Gather the AOS effective scissor rects based on the per-prim VP index.
1082 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1083 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1084 if (state
.gsState
.emitsViewportArrayIndex
)
1086 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1087 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1089 else // broadcast fast path for non-VPAI case.
1091 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1092 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1093 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1094 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1097 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
1098 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
1099 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
1100 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
1102 // Cull bloated points completely outside scissor
1103 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1104 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1105 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1106 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1107 primMask
= primMask
& ~maskOutsideScissor
;
1109 // Convert bbox to macrotile units.
1110 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1111 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1112 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1113 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1115 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1116 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
1117 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
1118 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
1119 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
1121 // store render target array index
1122 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1123 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1125 simdvector vRtai
[2];
1126 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
1127 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
1128 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1132 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1135 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
1136 _simd_store_ps((float*)aPointSize
, vPointSize
);
1138 uint32_t *pPrimID
= (uint32_t *)&primID
;
1140 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
1141 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
1142 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
1144 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
1145 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
1146 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
1148 // scan remaining valid prims and bin each separately
1149 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1151 while (_BitScanForward(&primIndex
, primMask
))
1153 uint32_t linkageCount
= backendState
.numAttributes
;
1154 uint32_t numScalarAttribs
= linkageCount
* 4;
1159 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1161 desc
.triFlags
.frontFacing
= 1;
1162 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1163 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1164 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1165 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1167 work
.pfnWork
= RasterizeTriPoint
;
1169 auto pArena
= pDC
->pArena
;
1170 SWR_ASSERT(pArena
!= nullptr);
1172 // store active attribs
1173 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1174 desc
.numAttribs
= linkageCount
;
1175 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1177 // store point vertex data
1178 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1179 desc
.pTriBuffer
= pTriBuffer
;
1180 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1181 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1182 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1184 // store user clip distances
1185 if (rastState
.clipDistanceMask
)
1187 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1188 float one
[2] = {1.0f
, 1.0f
};
1189 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
1190 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, one
, desc
.pUserClipBuffer
);
1193 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1194 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1196 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1198 #if KNOB_ENABLE_TOSS_POINTS
1199 if (!KNOB_TOSS_SETUP_TRIS
)
1202 pTileMgr
->enqueue(x
, y
, &work
);
1207 primMask
&= ~(1 << primIndex
);
1211 AR_END(FEBinPoints
, 1);
1214 //////////////////////////////////////////////////////////////////////////
1215 /// @brief Bin SIMD lines to the backend.
1216 /// @param pDC - pointer to draw context.
1217 /// @param pa - The primitive assembly object.
1218 /// @param workerId - thread's worker id. Even thread has a unique id.
1219 /// @param tri - Contains line position data for SIMDs worth of points.
1220 /// @param primID - Primitive ID for each line.
1221 /// @param viewportIdx - Viewport Array Index for each line.
1222 void BinPostSetupLines(
1227 simdscalar recipW
[],
1230 simdscalari viewportIdx
)
1232 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1234 AR_BEGIN(FEBinLines
, pDC
->drawId
);
1236 const API_STATE
& state
= GetApiState(pDC
);
1237 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1238 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1239 const SWR_GS_STATE
& gsState
= state
.gsState
;
1241 // Select attribute processor
1242 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
1243 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1245 simdscalar
& vRecipW0
= recipW
[0];
1246 simdscalar
& vRecipW1
= recipW
[1];
1248 // convert to fixed point
1249 simdscalari vXi
[2], vYi
[2];
1250 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
1251 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
1252 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
1253 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
1255 // compute x-major vs y-major mask
1256 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
1257 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
1258 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
1259 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
1261 // cull zero-length lines
1262 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
1263 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
1265 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
1267 uint32_t *pPrimID
= (uint32_t *)&primID
;
1268 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1270 simdscalar vUnused
= _simd_setzero_ps();
1272 // Calc bounding box of lines
1274 bbox
.xmin
= _simd_min_epi32(vXi
[0], vXi
[1]);
1275 bbox
.xmax
= _simd_max_epi32(vXi
[0], vXi
[1]);
1276 bbox
.ymin
= _simd_min_epi32(vYi
[0], vYi
[1]);
1277 bbox
.ymax
= _simd_max_epi32(vYi
[0], vYi
[1]);
1279 // bloat bbox by line width along minor axis
1280 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
1281 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1283 bloatBox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1284 bloatBox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
1285 bloatBox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1286 bloatBox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
1288 bbox
.xmin
= _simd_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
1289 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
1290 bbox
.ymin
= _simd_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
1291 bbox
.ymax
= _simd_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
1293 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1294 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1295 if (state
.gsState
.emitsViewportArrayIndex
)
1297 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1298 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1300 else // broadcast fast path for non-VPAI case.
1302 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1303 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1304 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1305 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1308 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
1309 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
1310 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
1311 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
1313 // Cull prims completely outside scissor
1315 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1316 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1317 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1318 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1319 primMask
= primMask
& ~maskOutsideScissor
;
1327 // Convert triangle bbox to macrotile units.
1328 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1329 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1330 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1331 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1333 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1334 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
1335 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
1336 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
1337 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
1339 // transpose verts needed for backend
1340 /// @todo modify BE to take non-transformed verts
1341 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
1342 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
1343 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
1344 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
1345 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
1347 // store render target array index
1348 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1349 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1351 simdvector vRtai
[2];
1352 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
1353 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
1354 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1358 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1361 // scan remaining valid prims and bin each separately
1363 while (_BitScanForward(&primIndex
, primMask
))
1365 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1366 uint32_t numScalarAttribs
= linkageCount
* 4;
1371 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1373 desc
.triFlags
.frontFacing
= 1;
1374 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1375 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
1376 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1377 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1379 work
.pfnWork
= RasterizeLine
;
1381 auto pArena
= pDC
->pArena
;
1382 SWR_ASSERT(pArena
!= nullptr);
1384 // store active attribs
1385 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1386 desc
.numAttribs
= linkageCount
;
1387 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1389 // store line vertex data
1390 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1391 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
1392 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
1393 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
1394 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
1396 // store user clip distances
1397 if (rastState
.clipDistanceMask
)
1399 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1400 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
1401 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1404 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1405 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1407 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1409 #if KNOB_ENABLE_TOSS_POINTS
1410 if (!KNOB_TOSS_SETUP_TRIS
)
1413 pTileMgr
->enqueue(x
, y
, &work
);
1418 primMask
&= ~(1 << primIndex
);
1423 AR_END(FEBinLines
, 1);
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains line position data for a SIMD's worth of lines.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
1441 simdscalari viewportIdx
)
1443 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1445 const API_STATE
& state
= GetApiState(pDC
);
1446 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1447 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1448 const SWR_GS_STATE
& gsState
= state
.gsState
;
1450 // Select attribute processor
1451 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
1452 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1454 simdscalar vRecipW
[2] = { _simd_set1_ps(1.0f
), _simd_set1_ps(1.0f
) };
1456 if (!feState
.vpTransformDisable
)
1458 // perspective divide
1459 vRecipW
[0] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
1460 vRecipW
[1] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
1462 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW
[0]);
1463 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW
[1]);
1465 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW
[0]);
1466 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW
[1]);
1468 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW
[0]);
1469 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW
[1]);
1471 // viewport transform to screen coords
1472 if (state
.gsState
.emitsViewportArrayIndex
)
1474 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
1478 viewportTransform
<2>(prim
, state
.vpMatrices
);
1482 // adjust for pixel center location
1483 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1484 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
1485 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
1487 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
1488 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);