1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
32 #include "conservativeRast.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
// Forward declaration: bins post-setup LINE primitives to macro tiles.
// NOTE(review): several leading parameters (between '(' and recipW) are
// missing from this extraction of the declaration -- verify the full
// signature against the definition before relying on it.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupLinesImpl(
    Float<SIMD_T> recipW[],              // presumably per-vertex reciprocal W -- confirm at definition
    Integer<SIMD_T> const &primID,       // per-lane primitive IDs
    Integer<SIMD_T> const &viewportIdx,  // per-lane viewport array indices
    Integer<SIMD_T> const &rtIdx);       // per-lane render-target array indices
// Forward declaration: bins post-setup POINT primitives to macro tiles.
// NOTE(review): leading parameters are missing from this extraction of the
// declaration -- verify the full signature against the definition.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupPointsImpl(
    Integer<SIMD_T> const &primID,       // per-lane primitive IDs
    Integer<SIMD_T> const &viewportIdx,  // per-lane viewport array indices
    Integer<SIMD_T> const &rtIdx);       // per-lane render-target array indices
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief Processes attributes for the backend based on linkage mask and
64 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
65 /// @param pDC - Draw context
66 /// @param pa - Primitive Assembly state
67 /// @param linkageMask - Specifies which VS outputs are routed to PS.
68 /// @param pLinkageMap - maps VS attribute slot to PS slot
69 /// @param triIndex - Triangle to process attributes for
70 /// @param pBuffer - Output result
template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
INLINE
// NOTE(review): this extraction is missing the parameter list of
// ProcessAttributes (draw context, PA state, triIndex, primId, pBuffer, ...)
// as well as several braces, 'switch (topo)' headers, case labels, 'break's
// and local declarations (inputSlot, vid, comp) -- verify structure against
// the full header. Comments below document only what the visible code shows.
void ProcessAttributes(
    // Only point (1), line (2) or triangle (3) vertex counts are supported.
    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;

    // Constant values for component overrides; rows correspond to the
    // SWR_CONSTANT_SOURCE_CONST_* enum values used to index below.
    static const float constTable[3][4] = {
        { 0.0f, 0.0f, 0.0f, 0.0f },
        { 0.0f, 0.0f, 0.0f, 1.0f },
        { 1.0f, 1.0f, 1.0f, 1.0f }

    // SOA->AOS pack: one iteration per backend attribute slot.
    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
        if (IsSwizzledT::value)
            // Remap the source VS output slot through the swizzle map.
            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
            inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
            // Unswizzled path: attributes route 1:1.
            inputSlot = backendState.vertexAttribOffset + i;

        simd4scalar attrib[3]; // triangle attribs (always 4 wide)
        // Remember the output start so component overrides below can patch in place.
        float* pAttribStart = pBuffer;

        if (HasConstantInterpT::value || IsDegenerate::value)
            if (CheckBit(constantInterpMask, i))
                // Constant interpolation: replicate the provoking vertex's value.
                uint32_t adjustedTriIndex;
                // Provoking-vertex remap tables for strip/quad topologies;
                // [triIndex & 1] selects even/odd tri within the quad.
                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
                static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
                static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
                static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
                static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };

                // quad-list case: pick the tri/vertex holding the provoking vertex
                adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
                vid = quadProvokingVertex[triIndex & 1][provokingVertex];
                // quad-strip case
                adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
                vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
            case TOP_TRIANGLE_STRIP:
                adjustedTriIndex = triIndex;
                    ? tristripProvokingVertex[provokingVertex]
                // default case: use the provoking vertex index directly
                adjustedTriIndex = triIndex;
                vid = provokingVertex;

            pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
            // Broadcast the provoking vertex's attribute to every output vertex.
            for (uint32_t i = 0; i < NumVertsT::value; ++i)
                SIMD128::store_ps(pBuffer, attrib[vid]);
            // Linear-interpolation path (constant-interp bit not set for slot i).
            pa.AssembleSingle(inputSlot, triIndex, attrib);
            for (uint32_t i = 0; i < NumVertsT::value; ++i)
                SIMD128::store_ps(pBuffer, attrib[i]);
        // No constant interpolation anywhere: straight per-vertex copy.
        pa.AssembleSingle(inputSlot, triIndex, attrib);
        for (uint32_t i = 0; i < NumVertsT::value; ++i)
            SIMD128::store_ps(pBuffer, attrib[i]);

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t v = NumVertsT::value; v < 3; ++v)
            SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);

        // check for constant source overrides
        if (IsSwizzledT::value)
            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
            // Process each overridden component (one bit per component).
            while (_BitScanForward(&comp, mask))
                mask &= ~(1 << comp);  // consume this component's bit
                float constantValue = 0.0f;
                switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
                case SWR_CONSTANT_SOURCE_CONST_0000:
                case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
                case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
                    // Table lookup keyed by the constant-source enum value.
                    constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
                case SWR_CONSTANT_SOURCE_PRIM_ID:
                    // Reinterpret the integer primitive ID's bits as a float.
                    constantValue = *(float*)&primId;

                // apply constant value to all 3 vertices
                for (uint32_t v = 0; v < 3; ++v)
                    pAttribStart[comp + v * 4] = constantValue;
217 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
219 struct ProcessAttributesChooser
221 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
223 template <typename
... ArgsB
>
224 static FuncType
GetFunc()
226 return ProcessAttributes
<ArgsB
...>;
230 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
232 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Processes enabled user clip distances. Loads the active clip
237 /// distances from the PA, sets up barycentric equations, and
238 /// stores the results to the output buffer
239 /// @param pa - Primitive Assembly state
240 /// @param primIndex - primitive index to process
241 /// @param clipDistMask - mask of enabled clip distances
242 /// @param pUserClipBuffer - buffer to store results
// NOTE(review): the declaration of 'clipDist' (output of _BitScanForward)
// and the surrounding braces are missing from this extraction -- verify
// against the full header before relying on exact structure.
template<uint32_t NumVerts>
void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
    uint32_t clipDistMask = state.clipDistanceMask;
    // One iteration per enabled clip distance (one bit each in the mask).
    while (_BitScanForward(&clipDist, clipDistMask))
        clipDistMask &= ~(1 << clipDist);   // consume this distance's bit
        uint32_t clipSlot = clipDist >> 2;  // which vec4 clip/cull slot (0 or 1)
        uint32_t clipComp = clipDist & 0x3; // component within that slot
        uint32_t clipAttribSlot = clipSlot == 0 ?
            state.vertexClipCullOffset : state.vertexClipCullOffset + 1;

        // Gather the clip-distance vec4 for each vertex of this primitive.
        simd4scalar primClipDist[3];
        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);

        // Extract the scalar distance for each of the primitive's vertices.
        float vertClipDist[NumVerts];
        for (uint32_t e = 0; e < NumVerts; ++e)
            OSALIGNSIMD(float) aVertClipDist[4];
            SIMD128::store_ps(aVertClipDist, primClipDist[e]);
            vertClipDist[e] = aVertClipDist[clipComp];

        // setup plane equations for barycentric interpolation in the backend:
        // each distance is scaled by pRecipW, and the last vertex's term is
        // subtracted from the others so the backend can evaluate the plane
        // with barycentric weights.
        float baryCoeff[NumVerts];
        float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
        for (uint32_t e = 0; e < NumVerts - 1; ++e)
            baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
        baryCoeff[NumVerts - 1] = last;

        // Emit the coefficients sequentially to the output buffer.
        for (uint32_t e = 0; e < NumVerts; ++e)
            *(pUserClipBuffer++) = baryCoeff[e];
284 void TransposeVertices(simd4scalar(&dst
)[8], const simdscalar
&src0
, const simdscalar
&src1
, const simdscalar
&src2
)
286 vTranspose3x8(dst
, src0
, src1
, src2
);
290 void TransposeVertices(simd4scalar(&dst
)[16], const simd16scalar
&src0
, const simd16scalar
&src1
, const simd16scalar
&src2
)
292 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst
), src0
, src1
, src2
, _simd16_setzero_ps());
296 #if KNOB_ENABLE_EARLY_RAST
298 #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
299 #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
302 template<typename SIMD_T
>
303 struct EarlyRastHelper
308 struct EarlyRastHelper
<SIMD256
>
310 static SIMD256::Integer
InitShiftCntrl()
312 return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
316 #if USE_SIMD16_FRONTEND
318 struct EarlyRastHelper
<SIMD512
>
320 static SIMD512::Integer
InitShiftCntrl()
322 return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
327 //////////////////////////////////////////////////////////////////////////
328 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
329 /// (ER tile) can be rasterized as early as in binner to check if
330 /// they cover any pixels. If not - the triangles can be
331 /// culled in binner.
333 /// @param er_bbox - coordinates of ER tile for each triangle
334 /// @param vAi - A coefficients of triangle edges
335 /// @param vBi - B coefficients of triangle edges
336 /// @param vXi - X coordinates of triangle vertices
337 /// @param vYi - Y coordinates of triangle vertices
338 /// @param frontWindingTris - mask indicating CCW/CW triangles
339 /// @param triMask - mask for valid SIMD lanes (triangles)
340 /// @param oneTileMask - defines triangles for ER to work on
341 /// (tris that fit into ER tile)
// NOTE(review): this extraction is missing the 'triMask' and 'cwTrisMask'
// parameters, the #else/#endif separating the specialized 4x4-tile path
// from the generic path, most braces, the vRowEdge restore assignments in
// the generic loop, and the trailing 'return triMask;' -- verify structure
// against the full header. Comments document only what the visible code shows.
template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
uint32_t SIMDCALL EarlyRasterizer(
    SIMDBBOX_T<SIMD_T> &er_bbox,
    Integer<SIMD_T> (&vAi)[3],
    Integer<SIMD_T> (&vBi)[3],
    Integer<SIMD_T> (&vXi)[3],
    Integer<SIMD_T> (&vYi)[3],
    uint32_t oneTileMask)
    // step to pixel center of top-left pixel of the triangle bbox
    Integer<SIMD_T> vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
    vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));

    Integer<SIMD_T> vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
    vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));

    // negate A and B for CW tris
    Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));

    RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);

    // Shift each lane's CW bit into that lane's sign bit so blendv_ps can
    // select the negated coefficients per lane.
    Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl();
    Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask);
    Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);

    // CW triangles take the negated A/B so all lanes share one sign convention.
    vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
    vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
    vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
    vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
    vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
    vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));

    // evaluate edge equations at top-left pixel
    Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
    Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
    Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);

    Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
    Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
    Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);

    Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
    Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
    Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);

    Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
    Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
    Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);

    // edge value = A*dx + B*dy
    Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
    Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
    Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);

    // drop the fixed-point fraction
    vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
    vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
    vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);

    // adjusted (edge - 1) values used below for tie-breaking; this appears
    // to implement a top-left-style fill rule -- confirm against upstream.
    Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
    Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
    Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));

    // select the adjusted value where A's sign bit is set
    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));

    // horizontal edges: A == 0, use B's sign to decide the adjustment
    Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
    Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
    Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());

    vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
    vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
    vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);

    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));

#if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
    // Specialized 4x4 path: walk the 16 pixels in a serpentine order,
    // stepping edge values by A (x step) / B (y step). A pixel is covered
    // when the sign bits of all three edge values are set (and_si keeps the
    // common sign bit; movemask below reads it).
    Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
    vMask0 = SIMD_T::and_si(vMask0, vEdge2);

    Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
    Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
    Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
    Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask1 = SIMD_T::and_si(vMask1, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask2 = SIMD_T::and_si(vMask2, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask3 = SIMD_T::and_si(vMask3, vEdge2N);

    // One step to the right and then up
    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
    Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask4 = SIMD_T::and_si(vMask4, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask5 = SIMD_T::and_si(vMask5, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask6 = SIMD_T::and_si(vMask6, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask7 = SIMD_T::and_si(vMask7, vEdge2N);

    // OR together coverage of the first 8 pixels
    Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
    vLit1 = SIMD_T::or_si(vLit1, vMask2);
    vLit1 = SIMD_T::or_si(vLit1, vMask3);
    vLit1 = SIMD_T::or_si(vLit1, vMask4);
    vLit1 = SIMD_T::or_si(vLit1, vMask5);
    vLit1 = SIMD_T::or_si(vLit1, vMask6);
    vLit1 = SIMD_T::or_si(vLit1, vMask7);

    // Step to the right and go down again
    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
    vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask0 = SIMD_T::and_si(vMask0, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask1 = SIMD_T::and_si(vMask1, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask2 = SIMD_T::and_si(vMask2, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask3 = SIMD_T::and_si(vMask3, vEdge2N);

    // And for the last time - to the right and up
    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
    vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask4 = SIMD_T::and_si(vMask4, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask5 = SIMD_T::and_si(vMask5, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask6 = SIMD_T::and_si(vMask6, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask7 = SIMD_T::and_si(vMask7, vEdge2N);

    // OR together coverage of the remaining 8 pixels
    Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
    vLit2 = SIMD_T::or_si(vLit2, vMask2);
    vLit2 = SIMD_T::or_si(vLit2, vMask3);
    vLit2 = SIMD_T::or_si(vLit2, vMask4);
    vLit2 = SIMD_T::or_si(vLit2, vMask5);
    vLit2 = SIMD_T::or_si(vLit2, vMask6);
    vLit2 = SIMD_T::or_si(vLit2, vMask7);

    Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);

    // NOTE(review): an '#else' separating the path above from the generic
    // path below is missing from this extraction.
    // Generic algorithm sweeping in row by row order
    Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];

    Integer<SIMD_T> vEdge0N = vEdge0;
    Integer<SIMD_T> vEdge1N = vEdge1;
    Integer<SIMD_T> vEdge2N = vEdge2;

    for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
        // Store edge values at the beginning of the row
        Integer<SIMD_T> vRowEdge0 = vEdge0N;
        Integer<SIMD_T> vRowEdge1 = vEdge1N;
        Integer<SIMD_T> vRowEdge2 = vEdge2N;

        Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];

        // Step across the row; each column's coverage is the AND of the
        // three edge-value sign bits.
        for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
            vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
            vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);

            vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
            vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
            vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);

        // Fold the row's column coverage into one mask per row.
        vRowMask[row] = vColMask[0];
        for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
            vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);

        // Restore values and go to the next row
        // NOTE(review): the assignments restoring vEdge*N from vRowEdge*
        // appear to be missing from this extraction (vRowEdge0..2 are
        // otherwise unused) -- verify against the full header.
        vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
        vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
        vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);

    // compress all masks
    Integer<SIMD_T> vLit = vRowMask[0];
    for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
        vLit = SIMD_T::or_si(vLit, vRowMask[row]);
    // NOTE(review): the '#endif' closing the 4x4-vs-generic conditional is
    // missing from this extraction.

    // Check which triangles have any pixel lit
    uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
    uint32_t maskUnlit = ~maskLit & oneTileMask;

    // Drop fully-unlit one-tile triangles from the live-triangle mask.
    uint32_t oldTriMask = triMask;
    triMask &= ~maskUnlit;

    if (triMask ^ oldTriMask)
        RDTSC_EVENT(FEEarlyRastExit, _mm_popcnt_u32(triMask & oneTileMask), 0);
628 #endif // Early rasterizer
630 //////////////////////////////////////////////////////////////////////////
631 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
632 /// culling, viewport transform, etc.
633 /// @param pDC - pointer to draw context.
634 /// @param pa - The primitive assembly object.
635 /// @param workerId - thread's worker id. Even thread has a unique id.
636 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
637 /// @param primID - Primitive ID for each triangle.
638 /// @param viewportIdx - viewport array index for each triangle.
639 /// @tparam CT - ConservativeRastFETraits
640 template <typename SIMD_T
, uint32_t SIMD_WIDTH
, typename CT
>
641 void SIMDCALL
BinTrianglesImpl(
647 Integer
<SIMD_T
> const &primID
,
648 Integer
<SIMD_T
> const &viewportIdx
,
649 Integer
<SIMD_T
> const &rtIdx
)
651 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
653 RDTSC_BEGIN(FEBinTriangles
, pDC
->drawId
);
655 const API_STATE
& state
= GetApiState(pDC
);
656 const SWR_RASTSTATE
& rastState
= state
.rastState
;
657 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
659 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
661 Float
<SIMD_T
> vRecipW0
= SIMD_T::set1_ps(1.0f
);
662 Float
<SIMD_T
> vRecipW1
= SIMD_T::set1_ps(1.0f
);
663 Float
<SIMD_T
> vRecipW2
= SIMD_T::set1_ps(1.0f
);
665 if (feState
.vpTransformDisable
)
667 // RHW is passed in directly when VP transform is disabled
668 vRecipW0
= tri
[0].v
[3];
669 vRecipW1
= tri
[1].v
[3];
670 vRecipW2
= tri
[2].v
[3];
674 // Perspective divide
675 vRecipW0
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[0].w
);
676 vRecipW1
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[1].w
);
677 vRecipW2
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[2].w
);
679 tri
[0].v
[0] = SIMD_T::mul_ps(tri
[0].v
[0], vRecipW0
);
680 tri
[1].v
[0] = SIMD_T::mul_ps(tri
[1].v
[0], vRecipW1
);
681 tri
[2].v
[0] = SIMD_T::mul_ps(tri
[2].v
[0], vRecipW2
);
683 tri
[0].v
[1] = SIMD_T::mul_ps(tri
[0].v
[1], vRecipW0
);
684 tri
[1].v
[1] = SIMD_T::mul_ps(tri
[1].v
[1], vRecipW1
);
685 tri
[2].v
[1] = SIMD_T::mul_ps(tri
[2].v
[1], vRecipW2
);
687 tri
[0].v
[2] = SIMD_T::mul_ps(tri
[0].v
[2], vRecipW0
);
688 tri
[1].v
[2] = SIMD_T::mul_ps(tri
[1].v
[2], vRecipW1
);
689 tri
[2].v
[2] = SIMD_T::mul_ps(tri
[2].v
[2], vRecipW2
);
691 // Viewport transform to screen space coords
692 if (pa
.viewportArrayActive
)
694 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
698 viewportTransform
<3>(tri
, state
.vpMatrices
);
702 // Adjust for pixel center location
703 Float
<SIMD_T
> offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
705 tri
[0].x
= SIMD_T::add_ps(tri
[0].x
, offset
);
706 tri
[0].y
= SIMD_T::add_ps(tri
[0].y
, offset
);
708 tri
[1].x
= SIMD_T::add_ps(tri
[1].x
, offset
);
709 tri
[1].y
= SIMD_T::add_ps(tri
[1].y
, offset
);
711 tri
[2].x
= SIMD_T::add_ps(tri
[2].x
, offset
);
712 tri
[2].y
= SIMD_T::add_ps(tri
[2].y
, offset
);
714 // Set vXi, vYi to required fixed point precision
715 Integer
<SIMD_T
> vXi
[3], vYi
[3];
716 FPToFixedPoint
<SIMD_T
>(tri
, vXi
, vYi
);
719 Integer
<SIMD_T
> vAi
[3], vBi
[3];
720 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
723 Integer
<SIMD_T
> vDet
[2];
724 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
727 uint32_t maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet
[0], SIMD_T::setzero_si())));
728 uint32_t maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet
[1], SIMD_T::setzero_si())));
730 uint32_t cullZeroAreaMask
= maskLo
| (maskHi
<< (SIMD_WIDTH
/ 2));
732 // don't cull degenerate triangles if we're conservatively rasterizing
733 uint32_t origTriMask
= triMask
;
734 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
736 triMask
&= ~cullZeroAreaMask
;
739 // determine front winding tris
742 // 0 area triangles are marked as backfacing regardless of winding order,
743 // which is required behavior for conservative rast and wireframe rendering
744 uint32_t frontWindingTris
;
745 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
747 maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[0], SIMD_T::setzero_si())));
748 maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[1], SIMD_T::setzero_si())));
752 maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet
[0])));
753 maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet
[1])));
755 frontWindingTris
= maskLo
| (maskHi
<< (SIMD_WIDTH
/ 2));
759 switch ((SWR_CULLMODE
)rastState
.cullMode
)
761 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
762 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
763 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
764 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
765 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
766 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
769 triMask
&= ~cullTris
;
771 if (origTriMask
^ triMask
)
773 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
776 AR_EVENT(CullInfoEvent(pDC
->drawId
, cullZeroAreaMask
, cullTris
, origTriMask
));
778 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
779 // compute per tri backface
780 uint32_t frontFaceMask
= frontWindingTris
;
781 uint32_t *pPrimID
= (uint32_t *)&primID
;
782 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
786 PFN_WORK_FUNC pfnWork
;
787 if (CT::IsConservativeT::value
)
789 // determine which edges of the degenerate tri, if any, are valid to rasterize.
790 // used to call the appropriate templated rasterizer function
791 if (cullZeroAreaMask
> 0)
794 const Integer
<SIMD_T
> x0x1Mask
= SIMD_T::cmpeq_epi32(vXi
[0], vXi
[1]);
795 const Integer
<SIMD_T
> y0y1Mask
= SIMD_T::cmpeq_epi32(vYi
[0], vYi
[1]);
797 uint32_t e0Mask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask
, y0y1Mask
)));
800 const Integer
<SIMD_T
> x1x2Mask
= SIMD_T::cmpeq_epi32(vXi
[1], vXi
[2]);
801 const Integer
<SIMD_T
> y1y2Mask
= SIMD_T::cmpeq_epi32(vYi
[1], vYi
[2]);
803 uint32_t e1Mask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask
, y1y2Mask
)));
806 // if v0 == v1 & v1 == v2, v0 == v2
807 uint32_t e2Mask
= e0Mask
& e1Mask
;
808 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
810 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
811 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
812 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
814 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
815 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
817 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
818 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
820 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
824 edgeEnable
= 0x00FFFFFF;
829 // degenerate triangles won't be sent to rasterizer; just enable all edges
830 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
831 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
834 SIMDBBOX_T
<SIMD_T
> bbox
;
838 goto endBinTriangles
;
841 // Calc bounding box of triangles
842 calcBoundingBoxIntVertical
<SIMD_T
, CT
>(vXi
, vYi
, bbox
);
844 // determine if triangle falls between pixel centers and discard
845 // only discard for non-MSAA case and when conservative rast is disabled
846 // (xmin + 127) & ~255
847 // (xmax + 128) & ~255
848 if ((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
849 (!CT::IsConservativeT::value
))
851 origTriMask
= triMask
;
856 Integer
<SIMD_T
> xmin
= SIMD_T::add_epi32(bbox
.xmin
, SIMD_T::set1_epi32(127));
857 xmin
= SIMD_T::and_si(xmin
, SIMD_T::set1_epi32(~255));
858 Integer
<SIMD_T
> xmax
= SIMD_T::add_epi32(bbox
.xmax
, SIMD_T::set1_epi32(128));
859 xmax
= SIMD_T::and_si(xmax
, SIMD_T::set1_epi32(~255));
861 Integer
<SIMD_T
> vMaskH
= SIMD_T::cmpeq_epi32(xmin
, xmax
);
863 Integer
<SIMD_T
> ymin
= SIMD_T::add_epi32(bbox
.ymin
, SIMD_T::set1_epi32(127));
864 ymin
= SIMD_T::and_si(ymin
, SIMD_T::set1_epi32(~255));
865 Integer
<SIMD_T
> ymax
= SIMD_T::add_epi32(bbox
.ymax
, SIMD_T::set1_epi32(128));
866 ymax
= SIMD_T::and_si(ymax
, SIMD_T::set1_epi32(~255));
868 Integer
<SIMD_T
> vMaskV
= SIMD_T::cmpeq_epi32(ymin
, ymax
);
870 vMaskV
= SIMD_T::or_si(vMaskH
, vMaskV
);
871 cullCenterMask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV
));
874 triMask
&= ~cullCenterMask
;
876 if (origTriMask
^ triMask
)
878 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
882 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
883 // Gather the AOS effective scissor rects based on the per-prim VP index.
884 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
886 Integer
<SIMD_T
> scisXmin
, scisYmin
, scisXmax
, scisYmax
;
887 if (pa
.viewportArrayActive
)
890 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
892 else // broadcast fast path for non-VPAI case.
894 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
895 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
896 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
897 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
900 // Make triangle bbox inclusive
901 bbox
.xmax
= SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1));
902 bbox
.ymax
= SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1));
904 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
905 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
906 bbox
.xmax
= SIMD_T::min_epi32(bbox
.xmax
, scisXmax
);
907 bbox
.ymax
= SIMD_T::min_epi32(bbox
.ymax
, scisYmax
);
910 if (CT::IsConservativeT::value
)
912 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
913 // some area. Bump the xmax/ymax edges out
915 Integer
<SIMD_T
> topEqualsBottom
= SIMD_T::cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
916 bbox
.ymax
= SIMD_T::blendv_epi32(bbox
.ymax
, SIMD_T::add_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), topEqualsBottom
);
918 Integer
<SIMD_T
> leftEqualsRight
= SIMD_T::cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
919 bbox
.xmax
= SIMD_T::blendv_epi32(bbox
.xmax
, SIMD_T::add_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), leftEqualsRight
);
922 // Cull tris completely outside scissor
924 Integer
<SIMD_T
> maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
925 Integer
<SIMD_T
> maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
926 Integer
<SIMD_T
> maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
927 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
928 triMask
= triMask
& ~maskOutsideScissor
;
931 #if KNOB_ENABLE_EARLY_RAST
932 if (rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&& !CT::IsConservativeT::value
)
934 // Try early rasterization - culling small triangles which do not cover any pixels
936 // convert to ER tiles
937 SIMDBBOX_T
<SIMD_T
> er_bbox
;
939 er_bbox
.xmin
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_X_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.xmin
);
940 er_bbox
.xmax
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_X_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.xmax
);
941 er_bbox
.ymin
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_Y_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.ymin
);
942 er_bbox
.ymax
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_Y_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.ymax
);
944 Integer
<SIMD_T
> vTileX
= SIMD_T::cmpeq_epi32(er_bbox
.xmin
, er_bbox
.xmax
);
945 Integer
<SIMD_T
> vTileY
= SIMD_T::cmpeq_epi32(er_bbox
.ymin
, er_bbox
.ymax
);
947 // Take only triangles that fit into ER tile
948 uint32_t oneTileMask
= triMask
& SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX
, vTileY
)));
952 // determine CW tris (det > 0)
953 uint32_t maskCwLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[0], SIMD_T::setzero_si())));
954 uint32_t maskCwHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[1], SIMD_T::setzero_si())));
955 uint32_t cwTrisMask
= maskCwLo
| (maskCwHi
<< (SIMD_WIDTH
/ 2));
957 // Try early rasterization
958 triMask
= EarlyRasterizer
<SIMD_T
, SIMD_WIDTH
, CT
>(er_bbox
, vAi
, vBi
, vXi
, vYi
, cwTrisMask
, triMask
, oneTileMask
);
962 RDTSC_END(FEBinTriangles
, 1);
973 // Send surviving triangles to the line or point binner based on fill mode
974 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
976 // Simple non-conformant wireframe mode, useful for debugging
977 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
978 Vec4
<SIMD_T
> line
[2];
979 Float
<SIMD_T
> recipW
[2];
983 recipW
[0] = vRecipW0
;
984 recipW
[1] = vRecipW1
;
986 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
, rtIdx
);
990 recipW
[0] = vRecipW1
;
991 recipW
[1] = vRecipW2
;
993 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
, rtIdx
);
997 recipW
[0] = vRecipW2
;
998 recipW
[1] = vRecipW0
;
1000 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
, rtIdx
);
1002 RDTSC_END(FEBinTriangles
, 1);
1005 else if (rastState
.fillMode
== SWR_FILLMODE_POINT
)
1008 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[0], triMask
, primID
, viewportIdx
, rtIdx
);
1009 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[1], triMask
, primID
, viewportIdx
, rtIdx
);
1010 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[2], triMask
, primID
, viewportIdx
, rtIdx
);
1012 RDTSC_END(FEBinTriangles
, 1);
1016 // Convert triangle bbox to macrotile units.
1017 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1018 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1019 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1020 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1022 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1024 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTLeft
), bbox
.xmin
);
1025 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTRight
), bbox
.xmax
);
1026 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTTop
), bbox
.ymin
);
1027 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTBottom
), bbox
.ymax
);
1029 // transpose verts needed for backend
1030 /// @todo modify BE to take non-transformed verts
1031 OSALIGNSIMD16(simd4scalar
) vHorizX
[SIMD_WIDTH
];
1032 OSALIGNSIMD16(simd4scalar
) vHorizY
[SIMD_WIDTH
];
1033 OSALIGNSIMD16(simd4scalar
) vHorizZ
[SIMD_WIDTH
];
1034 OSALIGNSIMD16(simd4scalar
) vHorizW
[SIMD_WIDTH
];
1036 TransposeVertices(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
1037 TransposeVertices(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
1038 TransposeVertices(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
1039 TransposeVertices(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
1041 // scan remaining valid triangles and bin each separately
1042 while (_BitScanForward(&triIndex
, triMask
))
1044 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1045 uint32_t numScalarAttribs
= linkageCount
* 4;
1051 if (CT::IsConservativeT::value
)
1053 // only rasterize valid edges if we have a degenerate primitive
1054 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
1055 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
1056 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
1058 // Degenerate triangles are required to be constant interpolated
1059 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
1063 isDegenerate
= false;
1064 work
.pfnWork
= pfnWork
;
1067 // Select attribute processor
1068 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
1069 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
1071 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1073 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
1074 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
1075 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
1077 auto pArena
= pDC
->pArena
;
1078 SWR_ASSERT(pArena
!= nullptr);
1080 // store active attribs
1081 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1082 desc
.pAttribs
= pAttribs
;
1083 desc
.numAttribs
= linkageCount
;
1084 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
1086 // store triangle vertex data
1087 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1089 SIMD128::store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
1090 SIMD128::store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
1091 SIMD128::store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
1092 SIMD128::store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
1094 // store user clip distances
1095 if (state
.backendState
.clipDistanceMask
)
1097 uint32_t numClipDist
= _mm_popcnt_u32(state
.backendState
.clipDistanceMask
);
1098 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1099 ProcessUserClipDist
<3>(state
.backendState
, pa
, triIndex
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1102 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
1104 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
1106 #if KNOB_ENABLE_TOSS_POINTS
1107 if (!KNOB_TOSS_SETUP_TRIS
)
1110 pTileMgr
->enqueue(x
, y
, &work
);
1115 triMask
&= ~(1 << triIndex
);
1118 RDTSC_END(FEBinTriangles
, 1);
// NOTE(review): SIMD256-width entry point; forwards to BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>.
// The extraction that produced this chunk dropped interior lines (function name, several
// parameters, braces) and fused original line numbers into the text — tokens below are
// preserved verbatim; restore from upstream binner.cpp before compiling.
1121 template <typename CT
>
1128 simdscalari
const &primID
,
1129 simdscalari
const &viewportIdx
,
1130 simdscalari
const &rtIdx
)
1132 BinTrianglesImpl
<SIMD256
, KNOB_SIMD_WIDTH
, CT
>(pDC
, pa
, workerId
, tri
, triMask
, primID
, viewportIdx
, rtIdx
);
// NOTE(review): SIMD512 (16-wide frontend) entry point; forwards to
// BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>. Extraction-mangled text preserved verbatim.
1135 #if USE_SIMD16_FRONTEND
1136 template <typename CT
>
1137 void SIMDCALL
BinTriangles_simd16(
1141 simd16vector tri
[3],
1143 simd16scalari
const &primID
,
1144 simd16scalari
const &viewportIdx
,
1145 simd16scalari
const &rtIdx
)
1147 BinTrianglesImpl
<SIMD512
, KNOB_SIMD16_WIDTH
, CT
>(pDC
, pa
, workerId
, tri
, triMask
, primID
, viewportIdx
, rtIdx
);
// NOTE(review): Chooser used by TemplateArgUnroller to map a runtime bool (conservative
// rasterization on/off) onto the matching BinTriangles<ConservativeRastFETraits<...>>
// template instantiation. Mangled extraction; tokens preserved verbatim.
1151 struct FEBinTrianglesChooser
1153 typedef PFN_PROCESS_PRIMS FuncType
;
1155 template <typename
... ArgsB
>
1156 static FuncType
GetFunc()
1158 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
1162 // Selector for correct templated BinTriangles function
// (returns the instantiation matching the IsConservative runtime flag)
1163 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
1165 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
// NOTE(review): simd16 counterpart of FEBinTrianglesChooser — selects the
// BinTriangles_simd16 instantiation. Mangled extraction; tokens preserved verbatim.
1168 #if USE_SIMD16_FRONTEND
1169 struct FEBinTrianglesChooser_simd16
1171 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType
;
1173 template <typename
... ArgsB
>
1174 static FuncType
GetFunc()
1176 return BinTriangles_simd16
<ConservativeRastFETraits
<ArgsB
...>>;
1180 // Selector for correct templated BinTriangles function
// (simd16 variant; returns the instantiation matching the IsConservative runtime flag)
1181 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
)
1183 return TemplateArgUnroller
<FEBinTrianglesChooser_simd16
>::GetFunc(IsConservative
);
// NOTE(review): Bins SIMD points (post viewport/setup) to macrotiles. Two paths are
// visible below: a simple-point fast path (1-pixel points, one macrotile each, rasterized
// by RasterizeSimplePoint) and a bloated-point path (point size > 1, bbox built from the
// point size, possibly spanning multiple macrotiles, rasterized by RasterizeTriPoint).
// This chunk is a lossy extraction: braces, some parameter lines and the BE_WORK
// declarations were dropped, and original line numbers are fused into the text.
// All original tokens are preserved verbatim; restore from upstream binner.cpp.
1188 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1189 void BinPostSetupPointsImpl(
1193 Vec4
<SIMD_T
> prim
[],
1195 Integer
<SIMD_T
> const &primID
,
1196 Integer
<SIMD_T
> const &viewportIdx
,
1197 Integer
<SIMD_T
> const &rtIdx
)
1199 RDTSC_BEGIN(FEBinPoints
, pDC
->drawId
);
1201 Vec4
<SIMD_T
> &primVerts
= prim
[0];
1203 const API_STATE
& state
= GetApiState(pDC
);
1204 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1205 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1207 // Select attribute processor
1208 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1209 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1211 // convert to fixed point
1212 Integer
<SIMD_T
> vXi
, vYi
;
1214 vXi
= fpToFixedPointVertical
<SIMD_T
>(primVerts
.x
);
1215 vYi
= fpToFixedPointVertical
<SIMD_T
>(primVerts
.y
);
// Fast path: simple 1-pixel points — each point lands in exactly one macrotile.
1217 if (CanUseSimplePoints(pDC
))
1219 // adjust for ymin-xmin rule
1220 vXi
= SIMD_T::sub_epi32(vXi
, SIMD_T::set1_epi32(1));
1221 vYi
= SIMD_T::sub_epi32(vYi
, SIMD_T::set1_epi32(1));
1223 // cull points off the ymin-xmin edge of the viewport
1224 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi
));
1225 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi
));
1227 // compute macro tile coordinates
1228 Integer
<SIMD_T
> macroX
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(vXi
);
1229 Integer
<SIMD_T
> macroY
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(vYi
);
1231 OSALIGNSIMD16(uint32_t) aMacroX
[SIMD_WIDTH
], aMacroY
[SIMD_WIDTH
];
1233 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMacroX
), macroX
);
1234 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMacroY
), macroY
);
1236 // compute raster tile coordinates
1237 Integer
<SIMD_T
> rasterX
= SIMD_T::template srai_epi32
<KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
>(vXi
);
1238 Integer
<SIMD_T
> rasterY
= SIMD_T::template srai_epi32
<KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
>(vYi
);
1240 // compute raster tile relative x,y for coverage mask
1241 Integer
<SIMD_T
> tileAlignedX
= SIMD_T::template slli_epi32
<KNOB_TILE_X_DIM_SHIFT
>(rasterX
);
1242 Integer
<SIMD_T
> tileAlignedY
= SIMD_T::template slli_epi32
<KNOB_TILE_Y_DIM_SHIFT
>(rasterY
);
1244 Integer
<SIMD_T
> tileRelativeX
= SIMD_T::sub_epi32(SIMD_T::template srai_epi32
<FIXED_POINT_SHIFT
>(vXi
), tileAlignedX
);
1245 Integer
<SIMD_T
> tileRelativeY
= SIMD_T::sub_epi32(SIMD_T::template srai_epi32
<FIXED_POINT_SHIFT
>(vYi
), tileAlignedY
);
1247 OSALIGNSIMD16(uint32_t) aTileRelativeX
[SIMD_WIDTH
];
1248 OSALIGNSIMD16(uint32_t) aTileRelativeY
[SIMD_WIDTH
];
1250 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileRelativeX
), tileRelativeX
);
1251 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileRelativeY
), tileRelativeY
);
1253 OSALIGNSIMD16(uint32_t) aTileAlignedX
[SIMD_WIDTH
];
1254 OSALIGNSIMD16(uint32_t) aTileAlignedY
[SIMD_WIDTH
];
1256 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileAlignedX
), tileAlignedX
);
1257 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileAlignedY
), tileAlignedY
);
1259 OSALIGNSIMD16(float) aZ
[SIMD_WIDTH
];
1260 SIMD_T::store_ps(reinterpret_cast<float *>(aZ
), primVerts
.z
);
1262 // store render target array index
1263 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
1265 uint32_t *pPrimID
= (uint32_t *)&primID
;
1266 DWORD primIndex
= 0;
1268 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1270 // scan remaining valid triangles and bin each separately
1271 while (_BitScanForward(&primIndex
, primMask
))
1273 uint32_t linkageCount
= backendState
.numAttributes
;
1274 uint32_t numScalarAttribs
= linkageCount
* 4;
1279 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1281 // points are always front facing
1282 desc
.triFlags
.frontFacing
= 1;
1283 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1284 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1286 work
.pfnWork
= RasterizeSimplePoint
;
1288 auto pArena
= pDC
->pArena
;
1289 SWR_ASSERT(pArena
!= nullptr);
1292 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1293 desc
.pAttribs
= pAttribs
;
1294 desc
.numAttribs
= linkageCount
;
1296 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1298 // store raster tile aligned x, y, perspective correct z
1299 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1300 desc
.pTriBuffer
= pTriBuffer
;
1301 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1302 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1303 *pTriBuffer
= aZ
[primIndex
];
1305 uint32_t tX
= aTileRelativeX
[primIndex
];
1306 uint32_t tY
= aTileRelativeY
[primIndex
];
1308 // pack the relative x,y into the coverageMask, the rasterizer will
1309 // generate the true coverage mask from it
1310 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1313 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1314 #if KNOB_ENABLE_TOSS_POINTS
1315 if (!KNOB_TOSS_SETUP_TRIS
)
1318 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1321 primMask
&= ~(1 << primIndex
);
1326 // non simple points need to be potentially binned to multiple macro tiles
1327 Float
<SIMD_T
> vPointSize
;
// Point size comes either per-vertex (pointParam / VERTEX_SGV_SLOT) or from the
// constant rastState.pointSize.
1329 if (rastState
.pointParam
)
1331 Vec4
<SIMD_T
> size
[3];
1332 pa
.Assemble(VERTEX_SGV_SLOT
, size
);
1333 vPointSize
= size
[0][VERTEX_SGV_POINT_SIZE_COMP
];
1337 vPointSize
= SIMD_T::set1_ps(rastState
.pointSize
);
1340 // bloat point to bbox
1341 SIMDBBOX_T
<SIMD_T
> bbox
;
1343 bbox
.xmin
= bbox
.xmax
= vXi
;
1344 bbox
.ymin
= bbox
.ymax
= vYi
;
1346 Float
<SIMD_T
> vHalfWidth
= SIMD_T::mul_ps(vPointSize
, SIMD_T::set1_ps(0.5f
));
1347 Integer
<SIMD_T
> vHalfWidthi
= fpToFixedPointVertical
<SIMD_T
>(vHalfWidth
);
1349 bbox
.xmin
= SIMD_T::sub_epi32(bbox
.xmin
, vHalfWidthi
);
1350 bbox
.xmax
= SIMD_T::add_epi32(bbox
.xmax
, vHalfWidthi
);
1351 bbox
.ymin
= SIMD_T::sub_epi32(bbox
.ymin
, vHalfWidthi
);
1352 bbox
.ymax
= SIMD_T::add_epi32(bbox
.ymax
, vHalfWidthi
);
1354 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1355 // Gather the AOS effective scissor rects based on the per-prim VP index.
1356 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1358 Integer
<SIMD_T
> scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1360 if (pa
.viewportArrayActive
)
1362 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1364 else // broadcast fast path for non-VPAI case.
1366 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1367 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1368 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1369 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1372 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
1373 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
1374 bbox
.xmax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), scisXmax
);
1375 bbox
.ymax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), scisYmax
);
1378 // Cull bloated points completely outside scissor
1379 Integer
<SIMD_T
> maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1380 Integer
<SIMD_T
> maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1381 Integer
<SIMD_T
> maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1382 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
1383 primMask
= primMask
& ~maskOutsideScissor
;
1385 // Convert bbox to macrotile units.
1386 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1387 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1388 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1389 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1391 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1393 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTLeft
), bbox
.xmin
);
1394 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTRight
), bbox
.xmax
);
1395 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTTop
), bbox
.ymin
);
1396 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTBottom
), bbox
.ymax
);
1398 // store render target array index
1399 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
1401 OSALIGNSIMD16(float) aPointSize
[SIMD_WIDTH
];
1402 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize
), vPointSize
);
1404 uint32_t *pPrimID
= (uint32_t *)&primID
;
1406 OSALIGNSIMD16(float) aPrimVertsX
[SIMD_WIDTH
];
1407 OSALIGNSIMD16(float) aPrimVertsY
[SIMD_WIDTH
];
1408 OSALIGNSIMD16(float) aPrimVertsZ
[SIMD_WIDTH
];
1410 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX
), primVerts
.x
);
1411 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY
), primVerts
.y
);
1412 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ
), primVerts
.z
);
1414 // scan remaining valid prims and bin each separately
1415 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1417 while (_BitScanForward(&primIndex
, primMask
))
1419 uint32_t linkageCount
= backendState
.numAttributes
;
1420 uint32_t numScalarAttribs
= linkageCount
* 4;
1425 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1427 desc
.triFlags
.frontFacing
= 1;
1428 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1429 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1430 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1432 work
.pfnWork
= RasterizeTriPoint
;
1434 auto pArena
= pDC
->pArena
;
1435 SWR_ASSERT(pArena
!= nullptr);
1437 // store active attribs
1438 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1439 desc
.numAttribs
= linkageCount
;
1440 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1442 // store point vertex data
1443 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1444 desc
.pTriBuffer
= pTriBuffer
;
1445 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1446 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1447 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1449 // store user clip distances
// NOTE(review): `one` and `dists` below have no visible declarations in this chunk —
// presumably declared on original lines 1454-1455, dropped by the extraction.
1450 if (backendState
.clipDistanceMask
)
1452 uint32_t numClipDist
= _mm_popcnt_u32(backendState
.clipDistanceMask
);
1453 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1456 ProcessUserClipDist
<1>(backendState
, pa
, primIndex
, &one
, dists
);
1457 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1458 desc
.pUserClipBuffer
[3 * i
+ 0] = 0.0f
;
1459 desc
.pUserClipBuffer
[3 * i
+ 1] = 0.0f
;
1460 desc
.pUserClipBuffer
[3 * i
+ 2] = dists
[i
];
1464 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1465 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1467 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1469 #if KNOB_ENABLE_TOSS_POINTS
1470 if (!KNOB_TOSS_SETUP_TRIS
)
1473 pTileMgr
->enqueue(x
, y
, &work
);
1478 primMask
&= ~(1 << primIndex
);
1482 RDTSC_END(FEBinPoints
, 1);
1485 //////////////////////////////////////////////////////////////////////////
1486 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1487 /// @param pDC - pointer to draw context.
1488 /// @param pa - The primitive assembly object.
1489 /// @param workerId - thread's worker id. Even thread has a unique id.
1490 /// @param tri - Contains point position data for SIMDs worth of points.
1491 /// @param primID - Primitive ID for each point.
// NOTE(review): performs perspective divide + viewport transform + pixel-center offset,
// then forwards to BinPostSetupPointsImpl. Extraction dropped the function-name line and
// several parameter lines; tokens below are preserved verbatim.
1492 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1497 Vec4
<SIMD_T
> prim
[3],
1499 Integer
<SIMD_T
> const &primID
,
1500 Integer
<SIMD_T
> const &viewportIdx
,
1501 Integer
<SIMD_T
> const &rtIdx
)
1503 const API_STATE
& state
= GetApiState(pDC
);
1504 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1505 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1507 if (!feState
.vpTransformDisable
)
1509 // perspective divide
1510 Float
<SIMD_T
> vRecipW0
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[0].w
);
1512 prim
[0].x
= SIMD_T::mul_ps(prim
[0].x
, vRecipW0
);
1513 prim
[0].y
= SIMD_T::mul_ps(prim
[0].y
, vRecipW0
);
1514 prim
[0].z
= SIMD_T::mul_ps(prim
[0].z
, vRecipW0
);
1516 // viewport transform to screen coords
1517 if (pa
.viewportArrayActive
)
1519 viewportTransform
<1>(prim
, state
.vpMatrices
, viewportIdx
);
1523 viewportTransform
<1>(prim
, state
.vpMatrices
);
1527 Float
<SIMD_T
> offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
1529 prim
[0].x
= SIMD_T::add_ps(prim
[0].x
, offset
);
1530 prim
[0].y
= SIMD_T::add_ps(prim
[0].y
, offset
);
1532 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(
// NOTE(review): tail of the SIMD256-width BinPoints wrapper; forwards to
// BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>. Leading lines of this definition were
// dropped by the extraction; tokens preserved verbatim.
1549 simdscalari
const &primID
,
1550 simdscalari
const &viewportIdx
,
1551 simdscalari
const &rtIdx
)
1553 BinPointsImpl
<SIMD256
, KNOB_SIMD_WIDTH
>(
// NOTE(review): simd16 point-binning wrapper; forwards to
// BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>. Call arguments were dropped by the
// extraction; tokens preserved verbatim.
1564 #if USE_SIMD16_FRONTEND
1565 void SIMDCALL
BinPoints_simd16(
1569 simd16vector prim
[3],
1571 simd16scalari
const &primID
,
1572 simd16scalari
const &viewportIdx
,
1573 simd16scalari
const & rtIdx
)
1575 BinPointsImpl
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1587 //////////////////////////////////////////////////////////////////////////
1588 /// @brief Bin SIMD lines to the backend.
1589 /// @param pDC - pointer to draw context.
1590 /// @param pa - The primitive assembly object.
1591 /// @param workerId - thread's worker id. Even thread has a unique id.
1592 /// @param tri - Contains line position data for SIMDs worth of points.
1593 /// @param primID - Primitive ID for each line.
1594 /// @param viewportIdx - Viewport Array Index for each line.
// NOTE(review): computes fixed-point endpoints, x/y-major classification, a line-width-
// bloated bbox clipped to scissor, then enqueues one RasterizeLine work item per covered
// macrotile. This chunk is a lossy extraction (braces, some parameters and declarations
// such as `work`/`primIndex` are missing); tokens preserved verbatim.
1595 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1596 void BinPostSetupLinesImpl(
1600 Vec4
<SIMD_T
> prim
[],
1601 Float
<SIMD_T
> recipW
[],
1603 Integer
<SIMD_T
> const &primID
,
1604 Integer
<SIMD_T
> const &viewportIdx
,
1605 Integer
<SIMD_T
> const &rtIdx
)
1607 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
1609 RDTSC_BEGIN(FEBinLines
, pDC
->drawId
);
1611 const API_STATE
&state
= GetApiState(pDC
);
1612 const SWR_RASTSTATE
&rastState
= state
.rastState
;
1614 // Select attribute processor
1615 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
1616 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1618 Float
<SIMD_T
> &vRecipW0
= recipW
[0];
1619 Float
<SIMD_T
> &vRecipW1
= recipW
[1];
1621 // convert to fixed point
1622 Integer
<SIMD_T
> vXi
[2], vYi
[2];
1624 vXi
[0] = fpToFixedPointVertical
<SIMD_T
>(prim
[0].x
);
1625 vYi
[0] = fpToFixedPointVertical
<SIMD_T
>(prim
[0].y
);
1626 vXi
[1] = fpToFixedPointVertical
<SIMD_T
>(prim
[1].x
);
1627 vYi
[1] = fpToFixedPointVertical
<SIMD_T
>(prim
[1].y
);
1629 // compute x-major vs y-major mask
1630 Integer
<SIMD_T
> xLength
= SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi
[0], vXi
[1]));
1631 Integer
<SIMD_T
> yLength
= SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi
[0], vYi
[1]));
1632 Float
<SIMD_T
> vYmajorMask
= SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength
, xLength
));
1633 uint32_t yMajorMask
= SIMD_T::movemask_ps(vYmajorMask
);
1635 // cull zero-length lines
1636 Integer
<SIMD_T
> vZeroLengthMask
= SIMD_T::cmpeq_epi32(xLength
, SIMD_T::setzero_si());
1637 vZeroLengthMask
= SIMD_T::and_si(vZeroLengthMask
, SIMD_T::cmpeq_epi32(yLength
, SIMD_T::setzero_si()));
1639 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask
));
1641 uint32_t *pPrimID
= (uint32_t *)&primID
;
1642 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1644 // Calc bounding box of lines
1645 SIMDBBOX_T
<SIMD_T
> bbox
;
1646 bbox
.xmin
= SIMD_T::min_epi32(vXi
[0], vXi
[1]);
1647 bbox
.xmax
= SIMD_T::max_epi32(vXi
[0], vXi
[1]);
1648 bbox
.ymin
= SIMD_T::min_epi32(vYi
[0], vYi
[1]);
1649 bbox
.ymax
= SIMD_T::max_epi32(vYi
[0], vYi
[1]);
1651 // bloat bbox by line width along minor axis
1652 Float
<SIMD_T
> vHalfWidth
= SIMD_T::set1_ps(rastState
.lineWidth
/ 2.0f
);
1653 Integer
<SIMD_T
> vHalfWidthi
= fpToFixedPointVertical
<SIMD_T
>(vHalfWidth
);
1655 SIMDBBOX_T
<SIMD_T
> bloatBox
;
1657 bloatBox
.xmin
= SIMD_T::sub_epi32(bbox
.xmin
, vHalfWidthi
);
1658 bloatBox
.xmax
= SIMD_T::add_epi32(bbox
.xmax
, vHalfWidthi
);
1659 bloatBox
.ymin
= SIMD_T::sub_epi32(bbox
.ymin
, vHalfWidthi
);
1660 bloatBox
.ymax
= SIMD_T::add_epi32(bbox
.ymax
, vHalfWidthi
);
// Select bloated extents only along the minor axis: x for y-major lines, y otherwise.
1662 bbox
.xmin
= SIMD_T::blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
1663 bbox
.xmax
= SIMD_T::blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
1664 bbox
.ymin
= SIMD_T::blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
1665 bbox
.ymax
= SIMD_T::blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
1667 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1669 Integer
<SIMD_T
> scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1671 if (pa
.viewportArrayActive
)
1673 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1675 else // broadcast fast path for non-VPAI case.
1677 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1678 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1679 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1680 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1683 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
1684 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
1685 bbox
.xmax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), scisXmax
);
1686 bbox
.ymax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), scisYmax
);
1689 // Cull prims completely outside scissor
1691 Integer
<SIMD_T
> maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1692 Integer
<SIMD_T
> maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1693 Integer
<SIMD_T
> maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1694 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
1695 primMask
= primMask
& ~maskOutsideScissor
;
1698 // transpose verts needed for backend
1699 /// @todo modify BE to take non-transformed verts
1700 OSALIGNSIMD16(simd4scalar
) vHorizX
[SIMD_WIDTH
];
1701 OSALIGNSIMD16(simd4scalar
) vHorizY
[SIMD_WIDTH
];
1702 OSALIGNSIMD16(simd4scalar
) vHorizZ
[SIMD_WIDTH
];
1703 OSALIGNSIMD16(simd4scalar
) vHorizW
[SIMD_WIDTH
];
1710 // Convert triangle bbox to macrotile units.
1711 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1712 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1713 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1714 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1716 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1718 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTLeft
), bbox
.xmin
);
1719 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTRight
), bbox
.xmax
);
1720 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTTop
), bbox
.ymin
);
1721 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTBottom
), bbox
.ymax
);
// Lines have only two vertices; the third transposed lane is zero-filled.
1723 TransposeVertices(vHorizX
, prim
[0].x
, prim
[1].x
, SIMD_T::setzero_ps());
1724 TransposeVertices(vHorizY
, prim
[0].y
, prim
[1].y
, SIMD_T::setzero_ps());
1725 TransposeVertices(vHorizZ
, prim
[0].z
, prim
[1].z
, SIMD_T::setzero_ps());
1726 TransposeVertices(vHorizW
, vRecipW0
, vRecipW1
, SIMD_T::setzero_ps());
1728 // scan remaining valid prims and bin each separately
1730 while (_BitScanForward(&primIndex
, primMask
))
1732 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1733 uint32_t numScalarAttribs
= linkageCount
* 4;
1738 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1740 desc
.triFlags
.frontFacing
= 1;
1741 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
1742 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1743 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1745 work
.pfnWork
= RasterizeLine
;
1747 auto pArena
= pDC
->pArena
;
1748 SWR_ASSERT(pArena
!= nullptr);
1750 // store active attribs
1751 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1752 desc
.numAttribs
= linkageCount
;
1753 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1755 // store line vertex data
1756 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1758 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
1759 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
1760 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
1761 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
1763 // store user clip distances
1764 if (state
.backendState
.clipDistanceMask
)
1766 uint32_t numClipDist
= _mm_popcnt_u32(state
.backendState
.clipDistanceMask
);
1767 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
1768 ProcessUserClipDist
<2>(state
.backendState
, pa
, primIndex
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1771 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1772 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1774 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1776 #if KNOB_ENABLE_TOSS_POINTS
1777 if (!KNOB_TOSS_SETUP_TRIS
)
1780 pTileMgr
->enqueue(x
, y
, &work
);
1785 primMask
&= ~(1 << primIndex
);
1790 RDTSC_END(FEBinLines
, 1);
1793 //////////////////////////////////////////////////////////////////////////
1794 /// @brief Bin SIMD lines to the backend.
1795 /// @param pDC - pointer to draw context.
1796 /// @param pa - The primitive assembly object.
1797 /// @param workerId - thread's worker id. Even thread has a unique id.
1798 /// @param tri - Contains line position data for SIMDs worth of points.
1799 /// @param primID - Primitive ID for each line.
1800 /// @param viewportIdx - Viewport Array Index for each line.
// NOTE(review): performs perspective divide on both endpoints, viewport transform and
// pixel-center offset, then forwards to BinPostSetupLinesImpl. Extraction dropped some
// parameter lines, braces and the forwarded call arguments; tokens preserved verbatim.
1801 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1802 void SIMDCALL
BinLinesImpl(
1806 Vec4
<SIMD_T
> prim
[3],
1808 Integer
<SIMD_T
> const &primID
,
1809 Integer
<SIMD_T
> const &viewportIdx
,
1810 Integer
<SIMD_T
> const & rtIdx
)
1812 const API_STATE
& state
= GetApiState(pDC
);
1813 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1814 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
// Default w-reciprocals of 1.0 are used when the viewport transform is disabled.
1816 Float
<SIMD_T
> vRecipW
[2] = { SIMD_T::set1_ps(1.0f
), SIMD_T::set1_ps(1.0f
) };
1818 if (!feState
.vpTransformDisable
)
1820 // perspective divide
1821 vRecipW
[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[0].w
);
1822 vRecipW
[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[1].w
);
1824 prim
[0].v
[0] = SIMD_T::mul_ps(prim
[0].v
[0], vRecipW
[0]);
1825 prim
[1].v
[0] = SIMD_T::mul_ps(prim
[1].v
[0], vRecipW
[1]);
1827 prim
[0].v
[1] = SIMD_T::mul_ps(prim
[0].v
[1], vRecipW
[0]);
1828 prim
[1].v
[1] = SIMD_T::mul_ps(prim
[1].v
[1], vRecipW
[1]);
1830 prim
[0].v
[2] = SIMD_T::mul_ps(prim
[0].v
[2], vRecipW
[0]);
1831 prim
[1].v
[2] = SIMD_T::mul_ps(prim
[1].v
[2], vRecipW
[1]);
1833 // viewport transform to screen coords
1834 if (pa
.viewportArrayActive
)
1836 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
1840 viewportTransform
<2>(prim
, state
.vpMatrices
);
1844 // adjust for pixel center location
1845 Float
<SIMD_T
> offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
1847 prim
[0].x
= SIMD_T::add_ps(prim
[0].x
, offset
);
1848 prim
[0].y
= SIMD_T::add_ps(prim
[0].y
, offset
);
1850 prim
[1].x
= SIMD_T::add_ps(prim
[1].x
, offset
);
1851 prim
[1].y
= SIMD_T::add_ps(prim
[1].y
, offset
);
1853 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(
// NOTE(review): tail of the SIMD256-width BinLines wrapper; forwards to
// BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>. Leading lines of this definition were
// dropped by the extraction; tokens preserved verbatim.
1871 simdscalari
const &primID
,
1872 simdscalari
const &viewportIdx
,
1873 simdscalari
const &rtIdx
)
1875 BinLinesImpl
<SIMD256
, KNOB_SIMD_WIDTH
>(pDC
, pa
, workerId
, prim
, primMask
, primID
, viewportIdx
, rtIdx
);
1878 #if USE_SIMD16_FRONTEND
1879 void SIMDCALL
BinLines_simd16(
1883 simd16vector prim
[3],
1885 simd16scalari
const &primID
,
1886 simd16scalari
const &viewportIdx
,
1887 simd16scalari
const &rtIdx
)
1889 BinLinesImpl
<SIMD512
, KNOB_SIMD16_WIDTH
>(pDC
, pa
, workerId
, prim
, primMask
, primID
, viewportIdx
, rtIdx
);