/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Implementation for the macrotile binner
*
******************************************************************************/
32 #include "conservativeRast.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
39 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
40 void BinPostSetupLinesImpl(
44 typename
SIMD_T::Vec4 prim
[],
45 typename
SIMD_T::Float recipW
[],
47 typename
SIMD_T::Integer
const &primID
,
48 typename
SIMD_T::Integer
const &viewportIdx
);
50 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
51 void BinPostSetupPointsImpl(
55 typename
SIMD_T::Vec4 prim
[],
57 typename
SIMD_T::Integer
const &primID
,
58 typename
SIMD_T::Integer
const &viewportIdx
);
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief Processes attributes for the backend based on linkage mask and
62 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
63 /// @param pDC - Draw context
64 /// @param pa - Primitive Assembly state
65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
66 /// @param pLinkageMap - maps VS attribute slot to PS slot
67 /// @param triIndex - Triangle to process attributes for
68 /// @param pBuffer - Output result
69 template<typename NumVertsT
, typename IsSwizzledT
, typename HasConstantInterpT
, typename IsDegenerate
>
70 INLINE
void ProcessAttributes(
77 static_assert(NumVertsT::value
> 0 && NumVertsT::value
<= 3, "Invalid value for NumVertsT");
78 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
79 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
80 uint32_t constantInterpMask
= IsDegenerate::value
? 0xFFFFFFFF : backendState
.constantInterpolationMask
;
81 const uint32_t provokingVertex
= pDC
->pState
->state
.frontendState
.topologyProvokingVertex
;
82 const PRIMITIVE_TOPOLOGY topo
= pDC
->pState
->state
.topology
;
84 static const float constTable
[3][4] = {
85 { 0.0f
, 0.0f
, 0.0f
, 0.0f
},
86 { 0.0f
, 0.0f
, 0.0f
, 1.0f
},
87 { 1.0f
, 1.0f
, 1.0f
, 1.0f
}
90 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
93 if (IsSwizzledT::value
)
95 SWR_ATTRIB_SWIZZLE attribSwizzle
= backendState
.swizzleMap
[i
];
96 inputSlot
= backendState
.vertexAttribOffset
+ attribSwizzle
.sourceAttrib
;
101 inputSlot
= backendState
.vertexAttribOffset
+ i
;
104 simd4scalar attrib
[3]; // triangle attribs (always 4 wide)
105 float* pAttribStart
= pBuffer
;
107 if (HasConstantInterpT::value
|| IsDegenerate::value
)
109 if (CheckBit(constantInterpMask
, i
))
112 uint32_t adjustedTriIndex
;
113 static const uint32_t tristripProvokingVertex
[] = { 0, 2, 1 };
114 static const int32_t quadProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
115 static const uint32_t quadProvokingVertex
[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
116 static const int32_t qstripProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
117 static const uint32_t qstripProvokingVertex
[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
121 adjustedTriIndex
= triIndex
+ quadProvokingTri
[triIndex
& 1][provokingVertex
];
122 vid
= quadProvokingVertex
[triIndex
& 1][provokingVertex
];
125 adjustedTriIndex
= triIndex
+ qstripProvokingTri
[triIndex
& 1][provokingVertex
];
126 vid
= qstripProvokingVertex
[triIndex
& 1][provokingVertex
];
128 case TOP_TRIANGLE_STRIP
:
129 adjustedTriIndex
= triIndex
;
131 ? tristripProvokingVertex
[provokingVertex
]
135 adjustedTriIndex
= triIndex
;
136 vid
= provokingVertex
;
140 pa
.AssembleSingle(inputSlot
, adjustedTriIndex
, attrib
);
142 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
144 SIMD128::store_ps(pBuffer
, attrib
[vid
]);
150 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
152 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
154 SIMD128::store_ps(pBuffer
, attrib
[i
]);
161 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
163 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
165 SIMD128::store_ps(pBuffer
, attrib
[i
]);
170 // pad out the attrib buffer to 3 verts to ensure the triangle
171 // interpolation code in the pixel shader works correctly for the
172 // 3 topologies - point, line, tri. This effectively zeros out the
173 // effect of the missing vertices in the triangle interpolation.
174 for (uint32_t v
= NumVertsT::value
; v
< 3; ++v
)
176 SIMD128::store_ps(pBuffer
, attrib
[NumVertsT::value
- 1]);
180 // check for constant source overrides
181 if (IsSwizzledT::value
)
183 uint32_t mask
= backendState
.swizzleMap
[i
].componentOverrideMask
;
187 while (_BitScanForward(&comp
, mask
))
189 mask
&= ~(1 << comp
);
191 float constantValue
= 0.0f
;
192 switch ((SWR_CONSTANT_SOURCE
)backendState
.swizzleMap
[i
].constantSource
)
194 case SWR_CONSTANT_SOURCE_CONST_0000
:
195 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT
:
196 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT
:
197 constantValue
= constTable
[backendState
.swizzleMap
[i
].constantSource
][comp
];
199 case SWR_CONSTANT_SOURCE_PRIM_ID
:
200 constantValue
= *(float*)&primId
;
204 // apply constant value to all 3 vertices
205 for (uint32_t v
= 0; v
< 3; ++v
)
207 pAttribStart
[comp
+ v
* 4] = constantValue
;
215 //////////////////////////////////////////////////////////////////////////
216 /// @brief Gather scissor rect data based on per-prim viewport indices.
217 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
218 /// @param pViewportIndex - array of per-primitive vewport indexes.
219 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
220 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
221 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
222 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
224 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
225 static void GatherScissors(const SWR_RECT
*pScissorsInFixedPoint
, const uint32_t *pViewportIndex
,
226 simdscalari
&scisXmin
, simdscalari
&scisYmin
, simdscalari
&scisXmax
, simdscalari
&scisYmax
)
228 scisXmin
= _simd_set_epi32(
229 pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
230 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
231 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
232 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
233 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
234 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
235 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
236 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
237 scisYmin
= _simd_set_epi32(
238 pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
239 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
240 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
241 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
242 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
243 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
244 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
245 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
246 scisXmax
= _simd_set_epi32(
247 pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
248 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
249 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
250 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
251 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
252 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
253 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
254 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
255 scisYmax
= _simd_set_epi32(
256 pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
257 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
258 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
259 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
260 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
261 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
262 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
263 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
266 static void GatherScissors(const SWR_RECT
*pScissorsInFixedPoint
, const uint32_t *pViewportIndex
,
267 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
, simd16scalari
&scisXmax
, simd16scalari
&scisYmax
)
269 scisXmin
= _simd16_set_epi32(
270 pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
271 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
272 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
273 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
274 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
275 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
276 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
277 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
,
278 pScissorsInFixedPoint
[pViewportIndex
[8]].xmin
,
279 pScissorsInFixedPoint
[pViewportIndex
[9]].xmin
,
280 pScissorsInFixedPoint
[pViewportIndex
[10]].xmin
,
281 pScissorsInFixedPoint
[pViewportIndex
[11]].xmin
,
282 pScissorsInFixedPoint
[pViewportIndex
[12]].xmin
,
283 pScissorsInFixedPoint
[pViewportIndex
[13]].xmin
,
284 pScissorsInFixedPoint
[pViewportIndex
[14]].xmin
,
285 pScissorsInFixedPoint
[pViewportIndex
[15]].xmin
);
287 scisYmin
= _simd16_set_epi32(
288 pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
289 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
290 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
291 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
292 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
293 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
294 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
295 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
,
296 pScissorsInFixedPoint
[pViewportIndex
[8]].ymin
,
297 pScissorsInFixedPoint
[pViewportIndex
[9]].ymin
,
298 pScissorsInFixedPoint
[pViewportIndex
[10]].ymin
,
299 pScissorsInFixedPoint
[pViewportIndex
[11]].ymin
,
300 pScissorsInFixedPoint
[pViewportIndex
[12]].ymin
,
301 pScissorsInFixedPoint
[pViewportIndex
[13]].ymin
,
302 pScissorsInFixedPoint
[pViewportIndex
[14]].ymin
,
303 pScissorsInFixedPoint
[pViewportIndex
[15]].ymin
);
305 scisXmax
= _simd16_set_epi32(
306 pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
307 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
308 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
309 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
310 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
311 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
312 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
313 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
,
314 pScissorsInFixedPoint
[pViewportIndex
[8]].xmax
,
315 pScissorsInFixedPoint
[pViewportIndex
[9]].xmax
,
316 pScissorsInFixedPoint
[pViewportIndex
[10]].xmax
,
317 pScissorsInFixedPoint
[pViewportIndex
[11]].xmax
,
318 pScissorsInFixedPoint
[pViewportIndex
[12]].xmax
,
319 pScissorsInFixedPoint
[pViewportIndex
[13]].xmax
,
320 pScissorsInFixedPoint
[pViewportIndex
[14]].xmax
,
321 pScissorsInFixedPoint
[pViewportIndex
[15]].xmax
);
323 scisYmax
= _simd16_set_epi32(
324 pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
325 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
326 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
327 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
328 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
329 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
330 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
331 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
,
332 pScissorsInFixedPoint
[pViewportIndex
[8]].ymax
,
333 pScissorsInFixedPoint
[pViewportIndex
[9]].ymax
,
334 pScissorsInFixedPoint
[pViewportIndex
[10]].ymax
,
335 pScissorsInFixedPoint
[pViewportIndex
[11]].ymax
,
336 pScissorsInFixedPoint
[pViewportIndex
[12]].ymax
,
337 pScissorsInFixedPoint
[pViewportIndex
[13]].ymax
,
338 pScissorsInFixedPoint
[pViewportIndex
[14]].ymax
,
339 pScissorsInFixedPoint
[pViewportIndex
[15]].ymax
);
342 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
344 struct ProcessAttributesChooser
346 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
348 template <typename
... ArgsB
>
349 static FuncType
GetFunc()
351 return ProcessAttributes
<ArgsB
...>;
355 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
357 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Processes enabled user clip distances. Loads the active clip
362 /// distances from the PA, sets up barycentric equations, and
363 /// stores the results to the output buffer
364 /// @param pa - Primitive Assembly state
365 /// @param primIndex - primitive index to process
366 /// @param clipDistMask - mask of enabled clip distances
367 /// @param pUserClipBuffer - buffer to store results
368 template<uint32_t NumVerts
>
369 void ProcessUserClipDist(const SWR_BACKEND_STATE
& state
, PA_STATE
& pa
, uint32_t primIndex
, float *pRecipW
, float* pUserClipBuffer
)
372 uint32_t clipDistMask
= state
.clipDistanceMask
;
373 while (_BitScanForward(&clipDist
, clipDistMask
))
375 clipDistMask
&= ~(1 << clipDist
);
376 uint32_t clipSlot
= clipDist
>> 2;
377 uint32_t clipComp
= clipDist
& 0x3;
378 uint32_t clipAttribSlot
= clipSlot
== 0 ?
379 state
.vertexClipCullOffset
: state
.vertexClipCullOffset
+ 1;
381 simd4scalar primClipDist
[3];
382 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
384 float vertClipDist
[NumVerts
];
385 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
387 OSALIGNSIMD(float) aVertClipDist
[4];
388 SIMD128::store_ps(aVertClipDist
, primClipDist
[e
]);
389 vertClipDist
[e
] = aVertClipDist
[clipComp
];
392 // setup plane equations for barycentric interpolation in the backend
393 float baryCoeff
[NumVerts
];
394 float last
= vertClipDist
[NumVerts
- 1] * pRecipW
[NumVerts
- 1];
395 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
397 baryCoeff
[e
] = vertClipDist
[e
] * pRecipW
[e
] - last
;
399 baryCoeff
[NumVerts
- 1] = last
;
401 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
403 *(pUserClipBuffer
++) = baryCoeff
[e
];
409 void TransposeVertices(simd4scalar(&dst
)[8], const simdscalar
&src0
, const simdscalar
&src1
, const simdscalar
&src2
)
411 vTranspose3x8(dst
, src0
, src1
, src2
);
415 void TransposeVertices(simd4scalar(&dst
)[16], const simd16scalar
&src0
, const simd16scalar
&src1
, const simd16scalar
&src2
)
417 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst
), src0
, src1
, src2
, _simd16_setzero_ps());
420 //////////////////////////////////////////////////////////////////////////
421 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
422 /// culling, viewport transform, etc.
423 /// @param pDC - pointer to draw context.
424 /// @param pa - The primitive assembly object.
425 /// @param workerId - thread's worker id. Even thread has a unique id.
426 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
427 /// @param primID - Primitive ID for each triangle.
428 /// @param viewportIdx - viewport array index for each triangle.
429 /// @tparam CT - ConservativeRastFETraits
430 template <typename SIMD_T
, uint32_t SIMD_WIDTH
, typename CT
>
431 void SIMDCALL
BinTrianglesImpl(
435 typename
SIMD_T::Vec4 tri
[3],
437 typename
SIMD_T::Integer
const &primID
)
439 SWR_CONTEXT
*pContext
= pDC
->pContext
;
441 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
443 const API_STATE
& state
= GetApiState(pDC
);
444 const SWR_RASTSTATE
& rastState
= state
.rastState
;
445 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
447 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
449 typename
SIMD_T::Float vRecipW0
= SIMD_T::set1_ps(1.0f
);
450 typename
SIMD_T::Float vRecipW1
= SIMD_T::set1_ps(1.0f
);
451 typename
SIMD_T::Float vRecipW2
= SIMD_T::set1_ps(1.0f
);
453 typename
SIMD_T::Integer viewportIdx
= SIMD_T::setzero_si();
454 typename
SIMD_T::Vec4 vpiAttrib
[3];
455 typename
SIMD_T::Integer vpai
= SIMD_T::setzero_si();
457 if (state
.backendState
.readViewportArrayIndex
)
459 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
461 vpai
= SIMD_T::castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
465 if (state
.backendState
.readViewportArrayIndex
) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
467 // OOB indices => forced to zero.
468 vpai
= SIMD_T::max_epi32(vpai
, SIMD_T::setzero_si());
469 typename
SIMD_T::Integer vNumViewports
= SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
470 typename
SIMD_T::Integer vClearMask
= SIMD_T::cmplt_epi32(vpai
, vNumViewports
);
471 viewportIdx
= SIMD_T::and_si(vClearMask
, vpai
);
478 if (feState
.vpTransformDisable
)
480 // RHW is passed in directly when VP transform is disabled
481 vRecipW0
= tri
[0].v
[3];
482 vRecipW1
= tri
[1].v
[3];
483 vRecipW2
= tri
[2].v
[3];
487 // Perspective divide
488 vRecipW0
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[0].w
);
489 vRecipW1
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[1].w
);
490 vRecipW2
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[2].w
);
492 tri
[0].v
[0] = SIMD_T::mul_ps(tri
[0].v
[0], vRecipW0
);
493 tri
[1].v
[0] = SIMD_T::mul_ps(tri
[1].v
[0], vRecipW1
);
494 tri
[2].v
[0] = SIMD_T::mul_ps(tri
[2].v
[0], vRecipW2
);
496 tri
[0].v
[1] = SIMD_T::mul_ps(tri
[0].v
[1], vRecipW0
);
497 tri
[1].v
[1] = SIMD_T::mul_ps(tri
[1].v
[1], vRecipW1
);
498 tri
[2].v
[1] = SIMD_T::mul_ps(tri
[2].v
[1], vRecipW2
);
500 tri
[0].v
[2] = SIMD_T::mul_ps(tri
[0].v
[2], vRecipW0
);
501 tri
[1].v
[2] = SIMD_T::mul_ps(tri
[1].v
[2], vRecipW1
);
502 tri
[2].v
[2] = SIMD_T::mul_ps(tri
[2].v
[2], vRecipW2
);
504 // Viewport transform to screen space coords
505 if (state
.backendState
.readViewportArrayIndex
)
507 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
511 viewportTransform
<3>(tri
, state
.vpMatrices
);
515 // Adjust for pixel center location
516 typename
SIMD_T::Float offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
518 tri
[0].x
= SIMD_T::add_ps(tri
[0].x
, offset
);
519 tri
[0].y
= SIMD_T::add_ps(tri
[0].y
, offset
);
521 tri
[1].x
= SIMD_T::add_ps(tri
[1].x
, offset
);
522 tri
[1].y
= SIMD_T::add_ps(tri
[1].y
, offset
);
524 tri
[2].x
= SIMD_T::add_ps(tri
[2].x
, offset
);
525 tri
[2].y
= SIMD_T::add_ps(tri
[2].y
, offset
);
527 // Set vXi, vYi to required fixed point precision
528 typename
SIMD_T::Integer vXi
[3], vYi
[3];
529 FPToFixedPoint
<SIMD_T
>(tri
, vXi
, vYi
);
532 typename
SIMD_T::Integer vAi
[3], vBi
[3];
533 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
536 typename
SIMD_T::Integer vDet
[2];
537 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
540 uint32_t maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet
[0], SIMD_T::setzero_si())));
541 uint32_t maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet
[1], SIMD_T::setzero_si())));
543 uint32_t cullZeroAreaMask
= maskLo
| (maskHi
<< (SIMD_WIDTH
/ 2));
545 // don't cull degenerate triangles if we're conservatively rasterizing
546 uint32_t origTriMask
= triMask
;
547 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
549 triMask
&= ~cullZeroAreaMask
;
552 // determine front winding tris
555 // 0 area triangles are marked as backfacing regardless of winding order,
556 // which is required behavior for conservative rast and wireframe rendering
557 uint32_t frontWindingTris
;
558 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
560 maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[0], SIMD_T::setzero_si())));
561 maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[1], SIMD_T::setzero_si())));
565 maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet
[0])));
566 maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet
[1])));
568 frontWindingTris
= maskLo
| (maskHi
<< (SIMD_WIDTH
/ 2));
572 switch ((SWR_CULLMODE
)rastState
.cullMode
)
574 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
575 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
576 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
577 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
578 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
579 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
582 triMask
&= ~cullTris
;
584 if (origTriMask
^ triMask
)
586 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
589 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
590 // compute per tri backface
591 uint32_t frontFaceMask
= frontWindingTris
;
592 uint32_t *pPrimID
= (uint32_t *)&primID
;
593 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
597 PFN_WORK_FUNC pfnWork
;
598 if (CT::IsConservativeT::value
)
600 // determine which edges of the degenerate tri, if any, are valid to rasterize.
601 // used to call the appropriate templated rasterizer function
602 if (cullZeroAreaMask
> 0)
605 const typename
SIMD_T::Integer x0x1Mask
= SIMD_T::cmpeq_epi32(vXi
[0], vXi
[1]);
606 const typename
SIMD_T::Integer y0y1Mask
= SIMD_T::cmpeq_epi32(vYi
[0], vYi
[1]);
608 uint32_t e0Mask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask
, y0y1Mask
)));
611 const typename
SIMD_T::Integer x1x2Mask
= SIMD_T::cmpeq_epi32(vXi
[1], vXi
[2]);
612 const typename
SIMD_T::Integer y1y2Mask
= SIMD_T::cmpeq_epi32(vYi
[1], vYi
[2]);
614 uint32_t e1Mask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask
, y1y2Mask
)));
617 // if v0 == v1 & v1 == v2, v0 == v2
618 uint32_t e2Mask
= e0Mask
& e1Mask
;
619 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
621 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
622 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
623 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
625 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
626 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
628 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
629 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
631 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
635 edgeEnable
= 0x00FFFFFF;
640 // degenerate triangles won't be sent to rasterizer; just enable all edges
641 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
642 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
645 SIMDBBOX_T
<SIMD_T
> bbox
;
649 goto endBinTriangles
;
652 // Calc bounding box of triangles
653 calcBoundingBoxIntVertical
<SIMD_T
, CT
>(vXi
, vYi
, bbox
);
655 // determine if triangle falls between pixel centers and discard
656 // only discard for non-MSAA case and when conservative rast is disabled
657 // (xmin + 127) & ~255
658 // (xmax + 128) & ~255
659 if ((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
660 (!CT::IsConservativeT::value
))
662 origTriMask
= triMask
;
667 typename
SIMD_T::Integer xmin
= SIMD_T::add_epi32(bbox
.xmin
, SIMD_T::set1_epi32(127));
668 xmin
= SIMD_T::and_si(xmin
, SIMD_T::set1_epi32(~255));
669 typename
SIMD_T::Integer xmax
= SIMD_T::add_epi32(bbox
.xmax
, SIMD_T::set1_epi32(128));
670 xmax
= SIMD_T::and_si(xmax
, SIMD_T::set1_epi32(~255));
672 typename
SIMD_T::Integer vMaskH
= SIMD_T::cmpeq_epi32(xmin
, xmax
);
674 typename
SIMD_T::Integer ymin
= SIMD_T::add_epi32(bbox
.ymin
, SIMD_T::set1_epi32(127));
675 ymin
= SIMD_T::and_si(ymin
, SIMD_T::set1_epi32(~255));
676 typename
SIMD_T::Integer ymax
= SIMD_T::add_epi32(bbox
.ymax
, SIMD_T::set1_epi32(128));
677 ymax
= SIMD_T::and_si(ymax
, SIMD_T::set1_epi32(~255));
679 typename
SIMD_T::Integer vMaskV
= SIMD_T::cmpeq_epi32(ymin
, ymax
);
681 vMaskV
= SIMD_T::or_si(vMaskH
, vMaskV
);
682 cullCenterMask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV
));
685 triMask
&= ~cullCenterMask
;
687 if (origTriMask
^ triMask
)
689 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
693 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
694 // Gather the AOS effective scissor rects based on the per-prim VP index.
695 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
697 typename
SIMD_T::Integer scisXmin
, scisYmin
, scisXmax
, scisYmax
;
699 if (state
.backendState
.readViewportArrayIndex
)
701 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
703 else // broadcast fast path for non-VPAI case.
705 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
706 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
707 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
708 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
711 // Make triangle bbox inclusive
712 bbox
.xmax
= SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1));
713 bbox
.ymax
= SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1));
715 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
716 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
717 bbox
.xmax
= SIMD_T::min_epi32(bbox
.xmax
, scisXmax
);
718 bbox
.ymax
= SIMD_T::min_epi32(bbox
.ymax
, scisYmax
);
721 if (CT::IsConservativeT::value
)
723 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
724 // some area. Bump the xmax/ymax edges out
726 typename
SIMD_T::Integer topEqualsBottom
= SIMD_T::cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
727 bbox
.ymax
= SIMD_T::blendv_epi32(bbox
.ymax
, SIMD_T::add_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), topEqualsBottom
);
729 typename
SIMD_T::Integer leftEqualsRight
= SIMD_T::cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
730 bbox
.xmax
= SIMD_T::blendv_epi32(bbox
.xmax
, SIMD_T::add_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), leftEqualsRight
);
733 // Cull tris completely outside scissor
735 typename
SIMD_T::Integer maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
736 typename
SIMD_T::Integer maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
737 typename
SIMD_T::Integer maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
738 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
739 triMask
= triMask
& ~maskOutsideScissor
;
745 // Send surviving triangles to the line or point binner based on fill mode
746 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
748 // Simple non-conformant wireframe mode, useful for debugging
749 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
750 typename
SIMD_T::Vec4 line
[2];
751 typename
SIMD_T::Float recipW
[2];
755 recipW
[0] = vRecipW0
;
756 recipW
[1] = vRecipW1
;
758 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
762 recipW
[0] = vRecipW1
;
763 recipW
[1] = vRecipW2
;
765 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
769 recipW
[0] = vRecipW2
;
770 recipW
[1] = vRecipW0
;
772 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
774 AR_END(FEBinTriangles
, 1);
777 else if (rastState
.fillMode
== SWR_FILLMODE_POINT
)
780 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[0], triMask
, primID
, viewportIdx
);
781 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[1], triMask
, primID
, viewportIdx
);
782 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[2], triMask
, primID
, viewportIdx
);
784 AR_END(FEBinTriangles
, 1);
788 // Convert triangle bbox to macrotile units.
789 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
790 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
791 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
792 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
794 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
796 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTLeft
), bbox
.xmin
);
797 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTRight
), bbox
.xmax
);
798 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTTop
), bbox
.ymin
);
799 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTBottom
), bbox
.ymax
);
801 // transpose verts needed for backend
802 /// @todo modify BE to take non-transformed verts
803 OSALIGNSIMD16(simd4scalar
) vHorizX
[SIMD_WIDTH
];
804 OSALIGNSIMD16(simd4scalar
) vHorizY
[SIMD_WIDTH
];
805 OSALIGNSIMD16(simd4scalar
) vHorizZ
[SIMD_WIDTH
];
806 OSALIGNSIMD16(simd4scalar
) vHorizW
[SIMD_WIDTH
];
808 TransposeVertices(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
809 TransposeVertices(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
810 TransposeVertices(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
811 TransposeVertices(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
813 // store render target array index
814 OSALIGNSIMD16(uint32_t) aRTAI
[SIMD_WIDTH
];
815 if (state
.backendState
.readRenderTargetArrayIndex
)
817 typename
SIMD_T::Vec4 vRtai
[3];
818 pa
.Assemble(VERTEX_SGV_SLOT
, vRtai
);
819 typename
SIMD_T::Integer vRtaii
;
820 vRtaii
= SIMD_T::castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
821 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), vRtaii
);
825 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), SIMD_T::setzero_si());
829 // scan remaining valid triangles and bin each separately
830 while (_BitScanForward(&triIndex
, triMask
))
832 uint32_t linkageCount
= state
.backendState
.numAttributes
;
833 uint32_t numScalarAttribs
= linkageCount
* 4;
839 if (CT::IsConservativeT::value
)
841 // only rasterize valid edges if we have a degenerate primitive
842 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
843 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
844 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
846 // Degenerate triangles are required to be constant interpolated
847 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
851 isDegenerate
= false;
852 work
.pfnWork
= pfnWork
;
855 // Select attribute processor
856 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
857 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
859 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
861 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
862 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
863 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
865 auto pArena
= pDC
->pArena
;
866 SWR_ASSERT(pArena
!= nullptr);
868 // store active attribs
869 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
870 desc
.pAttribs
= pAttribs
;
871 desc
.numAttribs
= linkageCount
;
872 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
874 // store triangle vertex data
875 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
877 SIMD128::store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
878 SIMD128::store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
879 SIMD128::store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
880 SIMD128::store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
882 // store user clip distances
883 if (state
.backendState
.clipDistanceMask
)
885 uint32_t numClipDist
= _mm_popcnt_u32(state
.backendState
.clipDistanceMask
);
886 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
887 ProcessUserClipDist
<3>(state
.backendState
, pa
, triIndex
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
890 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
892 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
894 #if KNOB_ENABLE_TOSS_POINTS
895 if (!KNOB_TOSS_SETUP_TRIS
)
898 pTileMgr
->enqueue(x
, y
, &work
);
903 triMask
&= ~(1 << triIndex
);
906 AR_END(FEBinTriangles
, 1);
// Thin dispatch wrapper: forwards a simd-width batch of triangles to the
// SIMD256 / KNOB_SIMD_WIDTH instantiation of BinTrianglesImpl, carrying the
// conservative-rasterization traits type CT through unchanged.
// NOTE(review): the extraction dropped original lines 910-915 (the parameter
// list -- presumably pDC, pa, workerId, tri, triMask, judging from the
// forwarded arguments below) -- restore from upstream before building.
909 template <typename CT
>
916 simdscalari
const &primID
)
918 BinTrianglesImpl
<SIMD256
, KNOB_SIMD_WIDTH
, CT
>(pDC
, pa
, workerId
, tri
, triMask
, primID
);
921 #if USE_SIMD16_FRONTEND
// simd16 counterpart of BinTriangles: forwards a simd16-width batch of
// triangles to the SIMD512 / KNOB_SIMD16_WIDTH instantiation of
// BinTrianglesImpl with the same traits type CT.
// NOTE(review): the extraction dropped original lines 924-928 (the parameter
// list -- presumably pDC, pa, workerId, tri, triMask, judging from the
// forwarded arguments below) -- restore from upstream before building.
922 template <typename CT
>
923 void SIMDCALL
BinTriangles_simd16(
929 simd16scalari
const &primID
)
931 BinTrianglesImpl
<SIMD512
, KNOB_SIMD16_WIDTH
, CT
>(pDC
, pa
, workerId
, tri
, triMask
, primID
);
935 struct FEBinTrianglesChooser
937 typedef PFN_PROCESS_PRIMS FuncType
;
939 template <typename
... ArgsB
>
940 static FuncType
GetFunc()
942 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
946 // Selector for correct templated BinTrinagles function
947 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
949 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief simd16 functor consumed by TemplateArgUnroller: maps a pack of
///        boolean template arguments onto the BinTriangles_simd16
///        instantiation whose ConservativeRastFETraits matches that pack.
struct FEBinTrianglesChooser_simd16
{
    using FuncType = PFN_PROCESS_PRIMS_SIMD16;

    /// @return Pointer to the BinTriangles_simd16 specialization selected by ArgsB.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
    }
};

//////////////////////////////////////////////////////////////////////////
/// @brief Selector for the correct templated BinTriangles_simd16 function.
/// @param IsConservative - true selects the conservative-rasterization
///        instantiation, false the standard one.
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
}
#endif
972 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
973 void BinPostSetupPointsImpl(
977 typename
SIMD_T::Vec4 prim
[],
979 typename
SIMD_T::Integer
const &primID
,
980 typename
SIMD_T::Integer
const &viewportIdx
)
982 SWR_CONTEXT
*pContext
= pDC
->pContext
;
984 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
986 typename
SIMD_T::Vec4
&primVerts
= prim
[0];
988 const API_STATE
& state
= GetApiState(pDC
);
989 const SWR_RASTSTATE
& rastState
= state
.rastState
;
990 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
992 // Select attribute processor
993 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
994 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
996 // convert to fixed point
997 typename
SIMD_T::Integer vXi
, vYi
;
999 vXi
= fpToFixedPointVertical
<SIMD_T
>(primVerts
.x
);
1000 vYi
= fpToFixedPointVertical
<SIMD_T
>(primVerts
.y
);
1002 if (CanUseSimplePoints(pDC
))
1004 // adjust for ymin-xmin rule
1005 vXi
= SIMD_T::sub_epi32(vXi
, SIMD_T::set1_epi32(1));
1006 vYi
= SIMD_T::sub_epi32(vYi
, SIMD_T::set1_epi32(1));
1008 // cull points off the ymin-xmin edge of the viewport
1009 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi
));
1010 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi
));
1012 // compute macro tile coordinates
1013 typename
SIMD_T::Integer macroX
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(vXi
);
1014 typename
SIMD_T::Integer macroY
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(vYi
);
1016 OSALIGNSIMD16(uint32_t) aMacroX
[SIMD_WIDTH
], aMacroY
[SIMD_WIDTH
];
1018 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMacroX
), macroX
);
1019 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMacroY
), macroY
);
1021 // compute raster tile coordinates
1022 typename
SIMD_T::Integer rasterX
= SIMD_T::template srai_epi32
<KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
>(vXi
);
1023 typename
SIMD_T::Integer rasterY
= SIMD_T::template srai_epi32
<KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
>(vYi
);
1025 // compute raster tile relative x,y for coverage mask
1026 typename
SIMD_T::Integer tileAlignedX
= SIMD_T::template slli_epi32
<KNOB_TILE_X_DIM_SHIFT
>(rasterX
);
1027 typename
SIMD_T::Integer tileAlignedY
= SIMD_T::template slli_epi32
<KNOB_TILE_Y_DIM_SHIFT
>(rasterY
);
1029 typename
SIMD_T::Integer tileRelativeX
= SIMD_T::sub_epi32(SIMD_T::template srai_epi32
<FIXED_POINT_SHIFT
>(vXi
), tileAlignedX
);
1030 typename
SIMD_T::Integer tileRelativeY
= SIMD_T::sub_epi32(SIMD_T::template srai_epi32
<FIXED_POINT_SHIFT
>(vYi
), tileAlignedY
);
1032 OSALIGNSIMD16(uint32_t) aTileRelativeX
[SIMD_WIDTH
];
1033 OSALIGNSIMD16(uint32_t) aTileRelativeY
[SIMD_WIDTH
];
1035 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aTileRelativeX
), tileRelativeX
);
1036 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aTileRelativeY
), tileRelativeY
);
1038 OSALIGNSIMD16(uint32_t) aTileAlignedX
[SIMD_WIDTH
];
1039 OSALIGNSIMD16(uint32_t) aTileAlignedY
[SIMD_WIDTH
];
1041 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aTileAlignedX
), tileAlignedX
);
1042 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aTileAlignedY
), tileAlignedY
);
1044 OSALIGNSIMD16(float) aZ
[SIMD_WIDTH
];
1045 SIMD_T::store_ps(reinterpret_cast<float *>(aZ
), primVerts
.z
);
1047 // store render target array index
1048 OSALIGNSIMD16(uint32_t) aRTAI
[SIMD_WIDTH
];
1049 if (state
.backendState
.readRenderTargetArrayIndex
)
1051 typename
SIMD_T::Vec4 vRtai
;
1052 pa
.Assemble(VERTEX_SGV_SLOT
, &vRtai
);
1053 typename
SIMD_T::Integer vRtaii
= SIMD_T::castps_si(vRtai
[VERTEX_SGV_RTAI_COMP
]);
1054 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), vRtaii
);
1058 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), SIMD_T::setzero_si());
1061 uint32_t *pPrimID
= (uint32_t *)&primID
;
1062 DWORD primIndex
= 0;
1064 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1066 // scan remaining valid triangles and bin each separately
1067 while (_BitScanForward(&primIndex
, primMask
))
1069 uint32_t linkageCount
= backendState
.numAttributes
;
1070 uint32_t numScalarAttribs
= linkageCount
* 4;
1075 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1077 // points are always front facing
1078 desc
.triFlags
.frontFacing
= 1;
1079 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1080 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1082 work
.pfnWork
= RasterizeSimplePoint
;
1084 auto pArena
= pDC
->pArena
;
1085 SWR_ASSERT(pArena
!= nullptr);
1088 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1089 desc
.pAttribs
= pAttribs
;
1090 desc
.numAttribs
= linkageCount
;
1092 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1094 // store raster tile aligned x, y, perspective correct z
1095 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1096 desc
.pTriBuffer
= pTriBuffer
;
1097 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1098 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1099 *pTriBuffer
= aZ
[primIndex
];
1101 uint32_t tX
= aTileRelativeX
[primIndex
];
1102 uint32_t tY
= aTileRelativeY
[primIndex
];
1104 // pack the relative x,y into the coverageMask, the rasterizer will
1105 // generate the true coverage mask from it
1106 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1109 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1110 #if KNOB_ENABLE_TOSS_POINTS
1111 if (!KNOB_TOSS_SETUP_TRIS
)
1114 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1117 primMask
&= ~(1 << primIndex
);
1122 // non simple points need to be potentially binned to multiple macro tiles
1123 typename
SIMD_T::Float vPointSize
;
1125 if (rastState
.pointParam
)
1127 typename
SIMD_T::Vec4 size
[3];
1128 pa
.Assemble(VERTEX_SGV_SLOT
, size
);
1129 vPointSize
= size
[0][VERTEX_SGV_POINT_SIZE_COMP
];
1133 vPointSize
= SIMD_T::set1_ps(rastState
.pointSize
);
1136 // bloat point to bbox
1137 SIMDBBOX_T
<SIMD_T
> bbox
;
1139 bbox
.xmin
= bbox
.xmax
= vXi
;
1140 bbox
.ymin
= bbox
.ymax
= vYi
;
1142 typename
SIMD_T::Float vHalfWidth
= SIMD_T::mul_ps(vPointSize
, SIMD_T::set1_ps(0.5f
));
1143 typename
SIMD_T::Integer vHalfWidthi
= fpToFixedPointVertical
<SIMD_T
>(vHalfWidth
);
1145 bbox
.xmin
= SIMD_T::sub_epi32(bbox
.xmin
, vHalfWidthi
);
1146 bbox
.xmax
= SIMD_T::add_epi32(bbox
.xmax
, vHalfWidthi
);
1147 bbox
.ymin
= SIMD_T::sub_epi32(bbox
.ymin
, vHalfWidthi
);
1148 bbox
.ymax
= SIMD_T::add_epi32(bbox
.ymax
, vHalfWidthi
);
1150 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1151 // Gather the AOS effective scissor rects based on the per-prim VP index.
1152 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1154 typename
SIMD_T::Integer scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1156 if (state
.backendState
.readViewportArrayIndex
)
1158 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1160 else // broadcast fast path for non-VPAI case.
1162 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1163 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1164 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1165 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1168 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
1169 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
1170 bbox
.xmax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), scisXmax
);
1171 bbox
.ymax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), scisYmax
);
1174 // Cull bloated points completely outside scissor
1175 typename
SIMD_T::Integer maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1176 typename
SIMD_T::Integer maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1177 typename
SIMD_T::Integer maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1178 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
1179 primMask
= primMask
& ~maskOutsideScissor
;
1181 // Convert bbox to macrotile units.
1182 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1183 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1184 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1185 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1187 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1189 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTLeft
), bbox
.xmin
);
1190 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTRight
), bbox
.xmax
);
1191 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTTop
), bbox
.ymin
);
1192 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTBottom
), bbox
.ymax
);
1194 // store render target array index
1195 OSALIGNSIMD16(uint32_t) aRTAI
[SIMD_WIDTH
];
1196 if (state
.backendState
.readRenderTargetArrayIndex
)
1198 typename
SIMD_T::Vec4 vRtai
[2];
1199 pa
.Assemble(VERTEX_SGV_SLOT
, vRtai
);
1200 typename
SIMD_T::Integer vRtaii
= SIMD_T::castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
1201 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), vRtaii
);
1205 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), SIMD_T::setzero_si());
1208 OSALIGNSIMD16(float) aPointSize
[SIMD_WIDTH
];
1209 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize
), vPointSize
);
1211 uint32_t *pPrimID
= (uint32_t *)&primID
;
1213 OSALIGNSIMD16(float) aPrimVertsX
[SIMD_WIDTH
];
1214 OSALIGNSIMD16(float) aPrimVertsY
[SIMD_WIDTH
];
1215 OSALIGNSIMD16(float) aPrimVertsZ
[SIMD_WIDTH
];
1217 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX
), primVerts
.x
);
1218 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY
), primVerts
.y
);
1219 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ
), primVerts
.z
);
1221 // scan remaining valid prims and bin each separately
1222 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1224 while (_BitScanForward(&primIndex
, primMask
))
1226 uint32_t linkageCount
= backendState
.numAttributes
;
1227 uint32_t numScalarAttribs
= linkageCount
* 4;
1232 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1234 desc
.triFlags
.frontFacing
= 1;
1235 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1236 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1237 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1239 work
.pfnWork
= RasterizeTriPoint
;
1241 auto pArena
= pDC
->pArena
;
1242 SWR_ASSERT(pArena
!= nullptr);
1244 // store active attribs
1245 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1246 desc
.numAttribs
= linkageCount
;
1247 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1249 // store point vertex data
1250 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1251 desc
.pTriBuffer
= pTriBuffer
;
1252 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1253 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1254 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1256 // store user clip distances
1257 if (backendState
.clipDistanceMask
)
1259 uint32_t numClipDist
= _mm_popcnt_u32(backendState
.clipDistanceMask
);
1260 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1263 ProcessUserClipDist
<1>(backendState
, pa
, primIndex
, &one
, dists
);
1264 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1265 desc
.pUserClipBuffer
[3 * i
+ 0] = 0.0f
;
1266 desc
.pUserClipBuffer
[3 * i
+ 1] = 0.0f
;
1267 desc
.pUserClipBuffer
[3 * i
+ 2] = dists
[i
];
1271 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1272 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1274 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1276 #if KNOB_ENABLE_TOSS_POINTS
1277 if (!KNOB_TOSS_SETUP_TRIS
)
1280 pTileMgr
->enqueue(x
, y
, &work
);
1285 primMask
&= ~(1 << primIndex
);
1289 AR_END(FEBinPoints
, 1);
1292 //////////////////////////////////////////////////////////////////////////
1293 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1294 /// @param pDC - pointer to draw context.
1295 /// @param pa - The primitive assembly object.
1296 /// @param workerId - thread's worker id. Even thread has a unique id.
1297 /// @param tri - Contains point position data for SIMDs worth of points.
1298 /// @param primID - Primitive ID for each point.
1299 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1304 typename
SIMD_T::Vec4 prim
[3],
1306 typename
SIMD_T::Integer
const &primID
)
1308 const API_STATE
& state
= GetApiState(pDC
);
1309 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1310 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1312 // Read back viewport index if required
1313 typename
SIMD_T::Integer viewportIdx
= SIMD_T::setzero_si();
1314 typename
SIMD_T::Vec4 vpiAttrib
[1];
1315 typename
SIMD_T::Integer vpai
= SIMD_T::setzero_si();
1317 if (state
.backendState
.readViewportArrayIndex
)
1319 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
1321 vpai
= SIMD_T::castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1325 if (state
.backendState
.readViewportArrayIndex
) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1327 // OOB indices => forced to zero.
1328 vpai
= SIMD_T::max_epi32(vpai
, SIMD_T::setzero_si());
1329 typename
SIMD_T::Integer vNumViewports
= SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1330 typename
SIMD_T::Integer vClearMask
= SIMD_T::cmplt_epi32(vpai
, vNumViewports
);
1331 viewportIdx
= SIMD_T::and_si(vClearMask
, vpai
);
1338 if (!feState
.vpTransformDisable
)
1340 // perspective divide
1341 typename
SIMD_T::Float vRecipW0
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[0].w
);
1343 prim
[0].x
= SIMD_T::mul_ps(prim
[0].x
, vRecipW0
);
1344 prim
[0].y
= SIMD_T::mul_ps(prim
[0].y
, vRecipW0
);
1345 prim
[0].z
= SIMD_T::mul_ps(prim
[0].z
, vRecipW0
);
1347 // viewport transform to screen coords
1348 if (state
.backendState
.readViewportArrayIndex
)
1350 viewportTransform
<1>(prim
, state
.vpMatrices
, viewportIdx
);
1354 viewportTransform
<1>(prim
, state
.vpMatrices
);
1358 typename
SIMD_T::Float offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
1360 prim
[0].x
= SIMD_T::add_ps(prim
[0].x
, offset
);
1361 prim
[0].y
= SIMD_T::add_ps(prim
[0].y
, offset
);
1363 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(
1379 simdscalari
const &primID
)
1381 BinPointsImpl
<SIMD256
, KNOB_SIMD_WIDTH
>(
1390 #if USE_SIMD16_FRONTEND
1391 void SIMDCALL
BinPoints_simd16(
1395 simd16vector prim
[3],
1397 simd16scalari
const &primID
)
1399 BinPointsImpl
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1409 //////////////////////////////////////////////////////////////////////////
1410 /// @brief Bin SIMD lines to the backend.
1411 /// @param pDC - pointer to draw context.
1412 /// @param pa - The primitive assembly object.
1413 /// @param workerId - thread's worker id. Even thread has a unique id.
1414 /// @param tri - Contains line position data for SIMDs worth of points.
1415 /// @param primID - Primitive ID for each line.
1416 /// @param viewportIdx - Viewport Array Index for each line.
1417 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1418 void BinPostSetupLinesImpl(
1422 typename
SIMD_T::Vec4 prim
[],
1423 typename
SIMD_T::Float recipW
[],
1425 typename
SIMD_T::Integer
const &primID
,
1426 typename
SIMD_T::Integer
const &viewportIdx
)
1428 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1430 AR_BEGIN(FEBinLines
, pDC
->drawId
);
1432 const API_STATE
&state
= GetApiState(pDC
);
1433 const SWR_RASTSTATE
&rastState
= state
.rastState
;
1435 // Select attribute processor
1436 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
1437 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1439 typename
SIMD_T::Float
&vRecipW0
= recipW
[0];
1440 typename
SIMD_T::Float
&vRecipW1
= recipW
[1];
1442 // convert to fixed point
1443 typename
SIMD_T::Integer vXi
[2], vYi
[2];
1445 vXi
[0] = fpToFixedPointVertical
<SIMD_T
>(prim
[0].x
);
1446 vYi
[0] = fpToFixedPointVertical
<SIMD_T
>(prim
[0].y
);
1447 vXi
[1] = fpToFixedPointVertical
<SIMD_T
>(prim
[1].x
);
1448 vYi
[1] = fpToFixedPointVertical
<SIMD_T
>(prim
[1].y
);
1450 // compute x-major vs y-major mask
1451 typename
SIMD_T::Integer xLength
= SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi
[0], vXi
[1]));
1452 typename
SIMD_T::Integer yLength
= SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi
[0], vYi
[1]));
1453 typename
SIMD_T::Float vYmajorMask
= SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength
, xLength
));
1454 uint32_t yMajorMask
= SIMD_T::movemask_ps(vYmajorMask
);
1456 // cull zero-length lines
1457 typename
SIMD_T::Integer vZeroLengthMask
= SIMD_T::cmpeq_epi32(xLength
, SIMD_T::setzero_si());
1458 vZeroLengthMask
= SIMD_T::and_si(vZeroLengthMask
, SIMD_T::cmpeq_epi32(yLength
, SIMD_T::setzero_si()));
1460 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask
));
1462 uint32_t *pPrimID
= (uint32_t *)&primID
;
1463 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1465 // Calc bounding box of lines
1466 SIMDBBOX_T
<SIMD_T
> bbox
;
1467 bbox
.xmin
= SIMD_T::min_epi32(vXi
[0], vXi
[1]);
1468 bbox
.xmax
= SIMD_T::max_epi32(vXi
[0], vXi
[1]);
1469 bbox
.ymin
= SIMD_T::min_epi32(vYi
[0], vYi
[1]);
1470 bbox
.ymax
= SIMD_T::max_epi32(vYi
[0], vYi
[1]);
1472 // bloat bbox by line width along minor axis
1473 typename
SIMD_T::Float vHalfWidth
= SIMD_T::set1_ps(rastState
.lineWidth
/ 2.0f
);
1474 typename
SIMD_T::Integer vHalfWidthi
= fpToFixedPointVertical
<SIMD_T
>(vHalfWidth
);
1476 SIMDBBOX_T
<SIMD_T
> bloatBox
;
1478 bloatBox
.xmin
= SIMD_T::sub_epi32(bbox
.xmin
, vHalfWidthi
);
1479 bloatBox
.xmax
= SIMD_T::add_epi32(bbox
.xmax
, vHalfWidthi
);
1480 bloatBox
.ymin
= SIMD_T::sub_epi32(bbox
.ymin
, vHalfWidthi
);
1481 bloatBox
.ymax
= SIMD_T::add_epi32(bbox
.ymax
, vHalfWidthi
);
1483 bbox
.xmin
= SIMD_T::blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
1484 bbox
.xmax
= SIMD_T::blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
1485 bbox
.ymin
= SIMD_T::blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
1486 bbox
.ymax
= SIMD_T::blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
1488 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1490 typename
SIMD_T::Integer scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1492 if (state
.backendState
.readViewportArrayIndex
)
1494 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1496 else // broadcast fast path for non-VPAI case.
1498 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1499 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1500 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1501 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1504 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
1505 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
1506 bbox
.xmax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), scisXmax
);
1507 bbox
.ymax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), scisYmax
);
1510 // Cull prims completely outside scissor
1512 typename
SIMD_T::Integer maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1513 typename
SIMD_T::Integer maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1514 typename
SIMD_T::Integer maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1515 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
1516 primMask
= primMask
& ~maskOutsideScissor
;
1519 // transpose verts needed for backend
1520 /// @todo modify BE to take non-transformed verts
1521 OSALIGNSIMD16(simd4scalar
) vHorizX
[SIMD_WIDTH
];
1522 OSALIGNSIMD16(simd4scalar
) vHorizY
[SIMD_WIDTH
];
1523 OSALIGNSIMD16(simd4scalar
) vHorizZ
[SIMD_WIDTH
];
1524 OSALIGNSIMD16(simd4scalar
) vHorizW
[SIMD_WIDTH
];
1531 // Convert triangle bbox to macrotile units.
1532 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1533 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1534 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1535 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1537 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1539 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTLeft
), bbox
.xmin
);
1540 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTRight
), bbox
.xmax
);
1541 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTTop
), bbox
.ymin
);
1542 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aMTBottom
), bbox
.ymax
);
1544 TransposeVertices(vHorizX
, prim
[0].x
, prim
[1].x
, SIMD_T::setzero_ps());
1545 TransposeVertices(vHorizY
, prim
[0].y
, prim
[1].y
, SIMD_T::setzero_ps());
1546 TransposeVertices(vHorizZ
, prim
[0].z
, prim
[1].z
, SIMD_T::setzero_ps());
1547 TransposeVertices(vHorizW
, vRecipW0
, vRecipW1
, SIMD_T::setzero_ps());
1549 // store render target array index
1550 OSALIGNSIMD16(uint32_t) aRTAI
[SIMD_WIDTH
];
1551 if (state
.backendState
.readRenderTargetArrayIndex
)
1553 typename
SIMD_T::Vec4 vRtai
[2];
1554 pa
.Assemble(VERTEX_SGV_SLOT
, vRtai
);
1555 typename
SIMD_T::Integer vRtaii
= SIMD_T::castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
1556 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), vRtaii
);
1560 SIMD_T::store_si(reinterpret_cast<typename
SIMD_T::Integer
*>(aRTAI
), SIMD_T::setzero_si());
1563 // scan remaining valid prims and bin each separately
1565 while (_BitScanForward(&primIndex
, primMask
))
1567 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1568 uint32_t numScalarAttribs
= linkageCount
* 4;
1573 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1575 desc
.triFlags
.frontFacing
= 1;
1576 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
1577 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1578 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1580 work
.pfnWork
= RasterizeLine
;
1582 auto pArena
= pDC
->pArena
;
1583 SWR_ASSERT(pArena
!= nullptr);
1585 // store active attribs
1586 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1587 desc
.numAttribs
= linkageCount
;
1588 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1590 // store line vertex data
1591 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1593 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
1594 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
1595 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
1596 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
1598 // store user clip distances
1599 if (state
.backendState
.clipDistanceMask
)
1601 uint32_t numClipDist
= _mm_popcnt_u32(state
.backendState
.clipDistanceMask
);
1602 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
1603 ProcessUserClipDist
<2>(state
.backendState
, pa
, primIndex
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1606 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1607 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1609 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1611 #if KNOB_ENABLE_TOSS_POINTS
1612 if (!KNOB_TOSS_SETUP_TRIS
)
1615 pTileMgr
->enqueue(x
, y
, &work
);
1620 primMask
&= ~(1 << primIndex
);
1625 AR_END(FEBinLines
, 1);
1628 //////////////////////////////////////////////////////////////////////////
1629 /// @brief Bin SIMD lines to the backend.
1630 /// @param pDC - pointer to draw context.
1631 /// @param pa - The primitive assembly object.
1632 /// @param workerId - thread's worker id. Even thread has a unique id.
1633 /// @param tri - Contains line position data for SIMDs worth of points.
1634 /// @param primID - Primitive ID for each line.
1635 /// @param viewportIdx - Viewport Array Index for each line.
1636 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1637 void SIMDCALL
BinLinesImpl(
1641 typename
SIMD_T::Vec4 prim
[3],
1643 typename
SIMD_T::Integer
const &primID
)
1645 const API_STATE
& state
= GetApiState(pDC
);
1646 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1647 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1649 typename
SIMD_T::Float vRecipW
[2] = { SIMD_T::set1_ps(1.0f
), SIMD_T::set1_ps(1.0f
) };
1651 typename
SIMD_T::Integer viewportIdx
= SIMD_T::setzero_si();
1652 typename
SIMD_T::Vec4 vpiAttrib
[2];
1653 typename
SIMD_T::Integer vpai
= SIMD_T::setzero_si();
1655 if (state
.backendState
.readViewportArrayIndex
)
1657 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
1658 vpai
= SIMD_T::castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1662 if (state
.backendState
.readViewportArrayIndex
) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1664 // OOB indices => forced to zero.
1665 vpai
= SIMD_T::max_epi32(vpai
, SIMD_T::setzero_si());
1666 typename
SIMD_T::Integer vNumViewports
= SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1667 typename
SIMD_T::Integer vClearMask
= SIMD_T::cmplt_epi32(vpai
, vNumViewports
);
1668 viewportIdx
= SIMD_T::and_si(vClearMask
, vpai
);
1671 if (!feState
.vpTransformDisable
)
1673 // perspective divide
1674 vRecipW
[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[0].w
);
1675 vRecipW
[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[1].w
);
1677 prim
[0].v
[0] = SIMD_T::mul_ps(prim
[0].v
[0], vRecipW
[0]);
1678 prim
[1].v
[0] = SIMD_T::mul_ps(prim
[1].v
[0], vRecipW
[1]);
1680 prim
[0].v
[1] = SIMD_T::mul_ps(prim
[0].v
[1], vRecipW
[0]);
1681 prim
[1].v
[1] = SIMD_T::mul_ps(prim
[1].v
[1], vRecipW
[1]);
1683 prim
[0].v
[2] = SIMD_T::mul_ps(prim
[0].v
[2], vRecipW
[0]);
1684 prim
[1].v
[2] = SIMD_T::mul_ps(prim
[1].v
[2], vRecipW
[1]);
1686 // viewport transform to screen coords
1687 if (state
.backendState
.readViewportArrayIndex
)
1689 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
1693 viewportTransform
<2>(prim
, state
.vpMatrices
);
1697 // adjust for pixel center location
1698 typename
SIMD_T::Float offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
1700 prim
[0].x
= SIMD_T::add_ps(prim
[0].x
, offset
);
1701 prim
[0].y
= SIMD_T::add_ps(prim
[0].y
, offset
);
1703 prim
[1].x
= SIMD_T::add_ps(prim
[1].x
, offset
);
1704 prim
[1].y
= SIMD_T::add_ps(prim
[1].y
, offset
);
1706 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(
1723 simdscalari
const &primID
)
1725 BinLinesImpl
<SIMD256
, KNOB_SIMD_WIDTH
>(pDC
, pa
, workerId
, prim
, primMask
, primID
);
1728 #if USE_SIMD16_FRONTEND
1729 void SIMDCALL
BinLines_simd16(
1733 simd16vector prim
[3],
1735 simd16scalari
const &primID
)
1737 BinLinesImpl
<SIMD512
, KNOB_SIMD16_WIDTH
>(pDC
, pa
, workerId
, prim
, primMask
, primID
);