1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
32 #include "conservativeRast.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
39 void BinPostSetupLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], simdscalar vRecipW
[2], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
40 void BinPostSetupPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
42 #if USE_SIMD16_FRONTEND
43 void BinPostSetupLines_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[3], simd16scalar vRecipW
[2], uint32_t primMask
, simd16scalari primID
, simd16scalari viewportIdx
);
44 void BinPostSetupPoints_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari primID
, simd16scalari viewportIdx
);
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend.  Essentially just doing
///        an SOA->AOS conversion and pack for each enabled attribute,
///        with provoking-vertex replication for constant-interpolated
///        attributes and constant-source overrides for swizzled ones.
/// @param pDC - Draw context
/// @param pa - Primitive Assembly state
/// @param triIndex - Triangle to process attributes for
/// @param primId - Primitive ID (consumed by SWR_CONSTANT_SOURCE_PRIM_ID)
/// @param pBuffer - Output result (AOS attribute buffer, 3 verts per attrib)
// NOTE(review): this block's text was damaged in extraction — the parameter
// list, braces, switch header and several case labels were dropped.  Only
// comments are added here; every surviving token is preserved verbatim.
56 template<typename NumVertsT
, typename IsSwizzledT
, typename HasConstantInterpT
, typename IsDegenerate
>
57 INLINE
void ProcessAttributes(
// Only 1 (point), 2 (line) or 3 (tri) vertices are supported.
64 static_assert(NumVertsT::value
> 0 && NumVertsT::value
<= 3, "Invalid value for NumVertsT");
65 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
66 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
67 uint32_t constantInterpMask
= IsDegenerate::value
? 0xFFFFFFFF : backendState
.constantInterpolationMask
;
68 const uint32_t provokingVertex
= pDC
->pState
->state
.frontendState
.topologyProvokingVertex
;
69 const PRIMITIVE_TOPOLOGY topo
= pDC
->pState
->state
.topology
;
// Constant values selectable by SWR_CONSTANT_SOURCE_* (enum indexes rows).
71 static const float constTable
[3][4] = {
72 { 0.0f
, 0.0f
, 0.0f
, 0.0f
},
73 { 0.0f
, 0.0f
, 0.0f
, 1.0f
},
74 { 1.0f
, 1.0f
, 1.0f
, 1.0f
}
// Pack one attribute per iteration.
77 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
// Resolve the VS output slot feeding this PS input.
80 if (IsSwizzledT::value
)
82 SWR_ATTRIB_SWIZZLE attribSwizzle
= backendState
.swizzleMap
[i
];
83 inputSlot
= backendState
.vertexAttribOffset
+ attribSwizzle
.sourceAttrib
;
88 inputSlot
= backendState
.vertexAttribOffset
+ i
;
91 simd4scalar attrib
[3]; // triangle attribs (always 4 wide)
92 float* pAttribStart
= pBuffer
;
// Constant-interpolation path: replicate the provoking vertex's value.
94 if (HasConstantInterpT::value
|| IsDegenerate::value
)
96 if (CheckBit(constantInterpMask
, i
))
99 uint32_t adjustedTriIndex
;
// Provoking-vertex lookup tables for quad / quad-strip / tri-strip
// topologies (the switch header and case labels were lost in extraction).
100 static const uint32_t tristripProvokingVertex
[] = { 0, 2, 1 };
101 static const int32_t quadProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
102 static const uint32_t quadProvokingVertex
[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
103 static const int32_t qstripProvokingTri
[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
104 static const uint32_t qstripProvokingVertex
[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
// Quad-list case: pick provoking tri/vertex from the quad tables.
108 adjustedTriIndex
= triIndex
+ quadProvokingTri
[triIndex
& 1][provokingVertex
];
109 vid
= quadProvokingVertex
[triIndex
& 1][provokingVertex
];
// Quad-strip case.
112 adjustedTriIndex
= triIndex
+ qstripProvokingTri
[triIndex
& 1][provokingVertex
];
113 vid
= qstripProvokingVertex
[triIndex
& 1][provokingVertex
];
115 case TOP_TRIANGLE_STRIP
:
116 adjustedTriIndex
= triIndex
;
// Odd strip triangles have reversed winding; remap the provoking vertex.
118 ? tristripProvokingVertex
[provokingVertex
]
// Default topology case: use the provoking vertex directly.
122 adjustedTriIndex
= triIndex
;
123 vid
= provokingVertex
;
127 pa
.AssembleSingle(inputSlot
, adjustedTriIndex
, attrib
);
// Broadcast the provoking vertex's attribute to all output verts.
129 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
131 SIMD128::store_ps(pBuffer
, attrib
[vid
]);
// Not constant-interpolated: store each vertex's own attribute.
137 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
139 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
141 SIMD128::store_ps(pBuffer
, attrib
[i
]);
// No constant interpolation possible: plain per-vertex store.
148 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
150 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
152 SIMD128::store_ps(pBuffer
, attrib
[i
]);
157 // pad out the attrib buffer to 3 verts to ensure the triangle
158 // interpolation code in the pixel shader works correctly for the
159 // 3 topologies - point, line, tri. This effectively zeros out the
160 // effect of the missing vertices in the triangle interpolation.
161 for (uint32_t v
= NumVertsT::value
; v
< 3; ++v
)
163 SIMD128::store_ps(pBuffer
, attrib
[NumVertsT::value
- 1]);
167 // check for constant source overrides
168 if (IsSwizzledT::value
)
170 uint32_t mask
= backendState
.swizzleMap
[i
].componentOverrideMask
;
// Consume one overridden component per iteration.
174 while (_BitScanForward(&comp
, mask
))
176 mask
&= ~(1 << comp
);
178 float constantValue
= 0.0f
;
// Table-backed sources index constTable by the enum value itself.
179 switch ((SWR_CONSTANT_SOURCE
)backendState
.swizzleMap
[i
].constantSource
)
181 case SWR_CONSTANT_SOURCE_CONST_0000
:
182 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT
:
183 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT
:
184 constantValue
= constTable
[backendState
.swizzleMap
[i
].constantSource
][comp
];
// Primitive ID is stored by bit-reinterpreting the uint as float.
186 case SWR_CONSTANT_SOURCE_PRIM_ID
:
187 constantValue
= *(float*)&primId
;
191 // apply constant value to all 3 vertices
192 for (uint32_t v
= 0; v
< 3; ++v
)
194 pAttribStart
[comp
+ v
* 4] = constantValue
;
202 //////////////////////////////////////////////////////////////////////////
203 /// @brief Gather scissor rect data based on per-prim viewport indices.
204 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
205 /// @param pViewportIndex - array of per-primitive vewport indexes.
206 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
207 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
208 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
209 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
211 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
212 template<size_t SimdWidth
>
213 struct GatherScissors
215 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
216 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
217 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
219 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
224 struct GatherScissors
<8>
226 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
227 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
228 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
230 scisXmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
231 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
232 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
233 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
234 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
235 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
236 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
237 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
238 scisYmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
239 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
240 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
241 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
242 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
243 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
244 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
245 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
246 scisXmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
247 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
248 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
249 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
250 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
251 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
252 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
253 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
254 scisYmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
255 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
256 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
257 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
258 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
259 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
260 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
261 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
265 #if USE_SIMD16_FRONTEND
266 template<size_t SimdWidth
>
267 struct GatherScissors_simd16
269 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
270 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
,
271 simd16scalari
&scisXmax
, simd16scalari
&scisYmax
)
273 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
278 struct GatherScissors_simd16
<16>
280 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
281 simd16scalari
&scisXmin
, simd16scalari
&scisYmin
,
282 simd16scalari
&scisXmax
, simd16scalari
&scisYmax
) {
283 scisXmin
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
284 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
285 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
286 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
287 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
288 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
289 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
290 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
,
291 pScissorsInFixedPoint
[pViewportIndex
[8]].xmin
,
292 pScissorsInFixedPoint
[pViewportIndex
[9]].xmin
,
293 pScissorsInFixedPoint
[pViewportIndex
[10]].xmin
,
294 pScissorsInFixedPoint
[pViewportIndex
[11]].xmin
,
295 pScissorsInFixedPoint
[pViewportIndex
[12]].xmin
,
296 pScissorsInFixedPoint
[pViewportIndex
[13]].xmin
,
297 pScissorsInFixedPoint
[pViewportIndex
[14]].xmin
,
298 pScissorsInFixedPoint
[pViewportIndex
[15]].xmin
);
300 scisYmin
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
301 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
302 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
303 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
304 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
305 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
306 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
307 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
,
308 pScissorsInFixedPoint
[pViewportIndex
[8]].ymin
,
309 pScissorsInFixedPoint
[pViewportIndex
[9]].ymin
,
310 pScissorsInFixedPoint
[pViewportIndex
[10]].ymin
,
311 pScissorsInFixedPoint
[pViewportIndex
[11]].ymin
,
312 pScissorsInFixedPoint
[pViewportIndex
[12]].ymin
,
313 pScissorsInFixedPoint
[pViewportIndex
[13]].ymin
,
314 pScissorsInFixedPoint
[pViewportIndex
[14]].ymin
,
315 pScissorsInFixedPoint
[pViewportIndex
[15]].ymin
);
317 scisXmax
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
318 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
319 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
320 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
321 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
322 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
323 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
324 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
,
325 pScissorsInFixedPoint
[pViewportIndex
[8]].xmax
,
326 pScissorsInFixedPoint
[pViewportIndex
[9]].xmax
,
327 pScissorsInFixedPoint
[pViewportIndex
[10]].xmax
,
328 pScissorsInFixedPoint
[pViewportIndex
[11]].xmax
,
329 pScissorsInFixedPoint
[pViewportIndex
[12]].xmax
,
330 pScissorsInFixedPoint
[pViewportIndex
[13]].xmax
,
331 pScissorsInFixedPoint
[pViewportIndex
[14]].xmax
,
332 pScissorsInFixedPoint
[pViewportIndex
[15]].xmax
);
334 scisYmax
= _simd16_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
335 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
336 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
337 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
338 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
339 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
340 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
341 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
,
342 pScissorsInFixedPoint
[pViewportIndex
[8]].ymax
,
343 pScissorsInFixedPoint
[pViewportIndex
[9]].ymax
,
344 pScissorsInFixedPoint
[pViewportIndex
[10]].ymax
,
345 pScissorsInFixedPoint
[pViewportIndex
[11]].ymax
,
346 pScissorsInFixedPoint
[pViewportIndex
[12]].ymax
,
347 pScissorsInFixedPoint
[pViewportIndex
[13]].ymax
,
348 pScissorsInFixedPoint
[pViewportIndex
[14]].ymax
,
349 pScissorsInFixedPoint
[pViewportIndex
[15]].ymax
);
354 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
356 struct ProcessAttributesChooser
358 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
360 template <typename
... ArgsB
>
361 static FuncType
GetFunc()
363 return ProcessAttributes
<ArgsB
...>;
367 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
369 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
372 //////////////////////////////////////////////////////////////////////////
373 /// @brief Processes enabled user clip distances. Loads the active clip
374 /// distances from the PA, sets up barycentric equations, and
375 /// stores the results to the output buffer
376 /// @param pa - Primitive Assembly state
377 /// @param primIndex - primitive index to process
378 /// @param clipDistMask - mask of enabled clip distances
379 /// @param pUserClipBuffer - buffer to store results
380 template<uint32_t NumVerts
>
381 void ProcessUserClipDist(PA_STATE
& pa
, uint32_t primIndex
, uint8_t clipDistMask
, float *pRecipW
, float* pUserClipBuffer
)
384 while (_BitScanForward(&clipDist
, clipDistMask
))
386 clipDistMask
&= ~(1 << clipDist
);
387 uint32_t clipSlot
= clipDist
>> 2;
388 uint32_t clipComp
= clipDist
& 0x3;
389 uint32_t clipAttribSlot
= clipSlot
== 0 ?
390 VERTEX_CLIPCULL_DIST_LO_SLOT
: VERTEX_CLIPCULL_DIST_HI_SLOT
;
392 simd4scalar primClipDist
[3];
393 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
395 float vertClipDist
[NumVerts
];
396 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
398 OSALIGNSIMD(float) aVertClipDist
[4];
399 SIMD128::store_ps(aVertClipDist
, primClipDist
[e
]);
400 vertClipDist
[e
] = aVertClipDist
[clipComp
];
403 // setup plane equations for barycentric interpolation in the backend
404 float baryCoeff
[NumVerts
];
405 float last
= vertClipDist
[NumVerts
- 1] * pRecipW
[NumVerts
- 1];
406 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
408 baryCoeff
[e
] = vertClipDist
[e
] * pRecipW
[e
] - last
;
410 baryCoeff
[NumVerts
- 1] = last
;
412 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
414 *(pUserClipBuffer
++) = baryCoeff
[e
];
419 //////////////////////////////////////////////////////////////////////////
420 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping,
421 /// culling, viewport transform, etc.
422 /// @param pDC - pointer to draw context.
423 /// @param pa - The primitive assembly object.
424 /// @param workerId - thread's worker id. Even thread has a unique id.
425 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
426 /// @param primID - Primitive ID for each triangle.
427 /// @param viewportIdx - viewport array index for each triangle.
428 /// @tparam CT - ConservativeRastFETraits
429 template <typename CT
>
438 SWR_CONTEXT
*pContext
= pDC
->pContext
;
440 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
442 const API_STATE
& state
= GetApiState(pDC
);
443 const SWR_RASTSTATE
& rastState
= state
.rastState
;
444 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
445 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
447 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
448 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
449 simdscalar vRecipW2
= _simd_set1_ps(1.0f
);
451 // Read viewport array index if needed
452 simdscalari viewportIdx
= _simd_set1_epi32(0);
453 if (state
.backendState
.readViewportArrayIndex
)
455 simdvector vpiAttrib
[3];
456 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
458 // OOB indices => forced to zero.
459 simdscalari vpai
= _simd_castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
460 vpai
= _simd_max_epi32(_simd_setzero_si(), vpai
);
461 simdscalari vNumViewports
= _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
462 simdscalari vClearMask
= _simd_cmplt_epi32(vpai
, vNumViewports
);
463 viewportIdx
= _simd_and_si(vClearMask
, vpai
);
466 if (feState
.vpTransformDisable
)
468 // RHW is passed in directly when VP transform is disabled
469 vRecipW0
= tri
[0].v
[3];
470 vRecipW1
= tri
[1].v
[3];
471 vRecipW2
= tri
[2].v
[3];
475 // Perspective divide
476 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[0].w
);
477 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[1].w
);
478 vRecipW2
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[2].w
);
480 tri
[0].v
[0] = _simd_mul_ps(tri
[0].v
[0], vRecipW0
);
481 tri
[1].v
[0] = _simd_mul_ps(tri
[1].v
[0], vRecipW1
);
482 tri
[2].v
[0] = _simd_mul_ps(tri
[2].v
[0], vRecipW2
);
484 tri
[0].v
[1] = _simd_mul_ps(tri
[0].v
[1], vRecipW0
);
485 tri
[1].v
[1] = _simd_mul_ps(tri
[1].v
[1], vRecipW1
);
486 tri
[2].v
[1] = _simd_mul_ps(tri
[2].v
[1], vRecipW2
);
488 tri
[0].v
[2] = _simd_mul_ps(tri
[0].v
[2], vRecipW0
);
489 tri
[1].v
[2] = _simd_mul_ps(tri
[1].v
[2], vRecipW1
);
490 tri
[2].v
[2] = _simd_mul_ps(tri
[2].v
[2], vRecipW2
);
492 // Viewport transform to screen space coords
493 if (state
.backendState
.readViewportArrayIndex
)
495 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
499 viewportTransform
<3>(tri
, state
.vpMatrices
);
503 // Adjust for pixel center location
504 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
505 tri
[0].x
= _simd_add_ps(tri
[0].x
, offset
);
506 tri
[0].y
= _simd_add_ps(tri
[0].y
, offset
);
508 tri
[1].x
= _simd_add_ps(tri
[1].x
, offset
);
509 tri
[1].y
= _simd_add_ps(tri
[1].y
, offset
);
511 tri
[2].x
= _simd_add_ps(tri
[2].x
, offset
);
512 tri
[2].y
= _simd_add_ps(tri
[2].y
, offset
);
514 simdscalari vXi
[3], vYi
[3];
515 // Set vXi, vYi to required fixed point precision
516 FPToFixedPoint(tri
, vXi
, vYi
);
519 simdscalari vAi
[3], vBi
[3];
520 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
524 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
527 int maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[0], _simd_setzero_si())));
528 int maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[1], _simd_setzero_si())));
530 int cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
532 uint32_t origTriMask
= triMask
;
533 // don't cull degenerate triangles if we're conservatively rasterizing
534 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
536 triMask
&= ~cullZeroAreaMask
;
539 // determine front winding tris
542 // 0 area triangles are marked as backfacing regardless of winding order,
543 // which is required behavior for conservative rast and wireframe rendering
544 uint32_t frontWindingTris
;
545 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
547 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[0], _simd_setzero_si())));
548 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[1], _simd_setzero_si())));
552 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[0])));
553 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet
[1])));
555 frontWindingTris
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
559 switch ((SWR_CULLMODE
)rastState
.cullMode
)
561 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
562 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
563 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
564 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
565 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
566 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
569 triMask
&= ~cullTris
;
571 if (origTriMask
^ triMask
)
573 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
576 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
577 // compute per tri backface
578 uint32_t frontFaceMask
= frontWindingTris
;
579 uint32_t *pPrimID
= (uint32_t *)&primID
;
580 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
583 PFN_WORK_FUNC pfnWork
;
584 if (CT::IsConservativeT::value
)
586 // determine which edges of the degenerate tri, if any, are valid to rasterize.
587 // used to call the appropriate templated rasterizer function
588 if (cullZeroAreaMask
> 0)
591 simdscalari x0x1Mask
= _simd_cmpeq_epi32(vXi
[0], vXi
[1]);
592 simdscalari y0y1Mask
= _simd_cmpeq_epi32(vYi
[0], vYi
[1]);
593 uint32_t e0Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask
, y0y1Mask
)));
596 simdscalari x1x2Mask
= _simd_cmpeq_epi32(vXi
[1], vXi
[2]);
597 simdscalari y1y2Mask
= _simd_cmpeq_epi32(vYi
[1], vYi
[2]);
598 uint32_t e1Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask
, y1y2Mask
)));
601 // if v0 == v1 & v1 == v2, v0 == v2
602 uint32_t e2Mask
= e0Mask
& e1Mask
;
603 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
605 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
606 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
607 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
608 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
609 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
610 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
611 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
613 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
617 edgeEnable
= 0x00FFFFFF;
622 // degenerate triangles won't be sent to rasterizer; just enable all edges
623 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
624 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
631 goto endBinTriangles
;
634 // Calc bounding box of triangles
635 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
637 // determine if triangle falls between pixel centers and discard
638 // only discard for non-MSAA case and when conservative rast is disabled
639 // (xmin + 127) & ~255
640 // (xmax + 128) & ~255
641 if((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
642 (!CT::IsConservativeT::value
))
644 origTriMask
= triMask
;
648 simdscalari xmin
= _simd_add_epi32(bbox
.xmin
, _simd_set1_epi32(127));
649 xmin
= _simd_and_si(xmin
, _simd_set1_epi32(~255));
650 simdscalari xmax
= _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(128));
651 xmax
= _simd_and_si(xmax
, _simd_set1_epi32(~255));
653 simdscalari vMaskH
= _simd_cmpeq_epi32(xmin
, xmax
);
655 simdscalari ymin
= _simd_add_epi32(bbox
.ymin
, _simd_set1_epi32(127));
656 ymin
= _simd_and_si(ymin
, _simd_set1_epi32(~255));
657 simdscalari ymax
= _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(128));
658 ymax
= _simd_and_si(ymax
, _simd_set1_epi32(~255));
660 simdscalari vMaskV
= _simd_cmpeq_epi32(ymin
, ymax
);
661 vMaskV
= _simd_or_si(vMaskH
, vMaskV
);
662 cullCenterMask
= _simd_movemask_ps(_simd_castsi_ps(vMaskV
));
665 triMask
&= ~cullCenterMask
;
667 if (origTriMask
^ triMask
)
669 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
673 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
674 // Gather the AOS effective scissor rects based on the per-prim VP index.
675 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
677 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
678 if (state
.backendState
.readViewportArrayIndex
)
680 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
681 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
683 else // broadcast fast path for non-VPAI case.
685 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
686 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
687 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
688 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
691 // Make triangle bbox inclusive
692 bbox
.xmax
= _simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1));
693 bbox
.ymax
= _simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1));
695 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
696 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
697 bbox
.xmax
= _simd_min_epi32(bbox
.xmax
, scisXmax
);
698 bbox
.ymax
= _simd_min_epi32(bbox
.ymax
, scisYmax
);
701 if (CT::IsConservativeT::value
)
703 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
704 // some area. Bump the xmax/ymax edges out
705 simdscalari topEqualsBottom
= _simd_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
706 bbox
.ymax
= _simd_blendv_epi32(bbox
.ymax
, _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(1)), topEqualsBottom
);
707 simdscalari leftEqualsRight
= _simd_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
708 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(1)), leftEqualsRight
);
711 // Cull tris completely outside scissor
713 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
714 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
715 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
716 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
717 triMask
= triMask
& ~maskOutsideScissor
;
722 // Send surviving triangles to the line or point binner based on fill mode
723 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
725 // Simple non-conformant wireframe mode, useful for debugging.
726 // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
728 simdscalar recipW
[2];
731 recipW
[0] = vRecipW0
;
732 recipW
[1] = vRecipW1
;
733 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
737 recipW
[0] = vRecipW1
;
738 recipW
[1] = vRecipW2
;
739 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
743 recipW
[0] = vRecipW2
;
744 recipW
[1] = vRecipW0
;
745 BinPostSetupLines(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
);
747 AR_END(FEBinTriangles
, 1);
750 else if (rastState
.fillMode
== SWR_FILLMODE_POINT
)
753 BinPostSetupPoints(pDC
, pa
, workerId
, &tri
[0], triMask
, primID
, viewportIdx
);
754 BinPostSetupPoints(pDC
, pa
, workerId
, &tri
[1], triMask
, primID
, viewportIdx
);
755 BinPostSetupPoints(pDC
, pa
, workerId
, &tri
[2], triMask
, primID
, viewportIdx
);
759 // Convert triangle bbox to macrotile units.
760 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
761 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
762 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
763 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
765 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
766 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
767 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
768 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
769 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
771 // transpose verts needed for backend
772 /// @todo modify BE to take non-transformed verts
773 simd4scalar vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
774 vTranspose3x8(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
775 vTranspose3x8(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
776 vTranspose3x8(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
777 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
779 // store render target array index
780 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
781 if (state
.backendState
.readRenderTargetArrayIndex
)
784 pa
.Assemble(VERTEX_SGV_SLOT
, vRtai
);
786 vRtaii
= _simd_castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
787 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
791 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
794 // scan remaining valid triangles and bin each separately
795 while (_BitScanForward(&triIndex
, triMask
))
797 uint32_t linkageCount
= state
.backendState
.numAttributes
;
798 uint32_t numScalarAttribs
= linkageCount
* 4;
804 if (CT::IsConservativeT::value
)
806 // only rasterize valid edges if we have a degenerate primitive
807 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
808 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
809 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
811 // Degenerate triangles are required to be constant interpolated
812 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
816 isDegenerate
= false;
817 work
.pfnWork
= pfnWork
;
820 // Select attribute processor
821 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
822 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
824 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
826 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
827 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
828 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
830 auto pArena
= pDC
->pArena
;
831 SWR_ASSERT(pArena
!= nullptr);
833 // store active attribs
834 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
835 desc
.pAttribs
= pAttribs
;
836 desc
.numAttribs
= linkageCount
;
837 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
839 // store triangle vertex data
840 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
842 SIMD128::store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
843 SIMD128::store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
844 SIMD128::store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
845 SIMD128::store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
847 // store user clip distances
848 if (rastState
.clipDistanceMask
)
850 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
851 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
852 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
855 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
857 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
859 #if KNOB_ENABLE_TOSS_POINTS
860 if (!KNOB_TOSS_SETUP_TRIS
)
863 pTileMgr
->enqueue(x
, y
, &work
);
867 triMask
&= ~(1 << triIndex
);
870 AR_END(FEBinTriangles
, 1);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD16 triangles to the backend: cull, viewport-transform,
///        compute fixed-point bounds and enqueue per-macrotile work.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id.
/// @param tri - Contains triangle position data for SIMD16's worth of triangles.
/// @param triMask - Mask of valid triangle lanes.
/// @param primID - Primitive ID for each triangle.
template <typename CT>
void SIMDCALL BinTriangles_simd16(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simd16vector tri[3],
    uint32_t triMask,
    simd16scalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
    simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
    simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);

    // Read back per-primitive viewport index if enabled; OOB indices => forced to zero.
    simd16scalari viewportIdx = _simd16_set1_epi32(0);
    if (state.backendState.readViewportArrayIndex)
    {
        simd16vector vpiAttrib[3];
        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);

        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
        viewportIdx = _simd16_and_si(vClearMask, vpai);
    }

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide
        vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.backendState.readViewportArrayIndex)
        {
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];

    tri[0].x = _simd16_add_ps(tri[0].x, offset);
    tri[0].y = _simd16_add_ps(tri[0].y, offset);

    tri[1].x = _simd16_add_ps(tri[1].x, offset);
    tri[1].y = _simd16_add_ps(tri[1].y, offset);

    tri[2].x = _simd16_add_ps(tri[2].x, offset);
    tri[2].y = _simd16_add_ps(tri[2].y, offset);

    simd16scalari vXi[3], vYi[3];

    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup
    simd16scalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant
    simd16scalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero-area triangles
    uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
    uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));

    uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));

    // don't cull degenerate triangles if we're conservatively rasterizing
    uint32_t origTriMask = triMask;
    if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // 0 area triangles are marked as backfacing regardless of winding order,
    // which is required behavior for conservative rast and wireframe rendering
    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
        maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
    }
    else
    {
        maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
        maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
    }

    frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));

    // cull by winding
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;

    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if (CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if (cullZeroAreaMask > 0)
        {
            // e0 = v1 - v0
            const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
            const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);

            uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2 - v1
            const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
            const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);

            uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0 - v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);

            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);

            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simd16BBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
        (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;

        {
            simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
            xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
            simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
            xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));

            simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);

            simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
            ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
            simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
            ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));

            simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);

            vMaskV = _simd16_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if (origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    {
        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;

        if (state.backendState.readViewportArrayIndex)
        {
            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        // Make triangle bbox inclusive
        bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
        bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));

        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
        bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
    }

    if (CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);

        simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    // Send surviving triangles to the line or point binner based on fill mode
    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
    {
        // Simple non-conformant wireframe mode, useful for debugging
        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
        simd16vector line[2];
        simd16scalar recipW[2];

        line[0] = tri[0];
        line[1] = tri[1];
        recipW[0] = vRecipW0;
        recipW[1] = vRecipW1;

        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        line[0] = tri[1];
        line[1] = tri[2];
        recipW[0] = vRecipW1;
        recipW[1] = vRecipW2;

        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        line[0] = tri[2];
        line[1] = tri[0];
        recipW[0] = vRecipW2;
        recipW[1] = vRecipW0;

        BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        AR_END(FEBinTriangles, 1);
        return;
    }
    else if (rastState.fillMode == SWR_FILLMODE_POINT)
    {
        // Bin 3 points
        BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
        BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
        BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);

        return;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];

    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
    _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH

    vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
    vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
    vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
    vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));

    vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
    vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
    vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
    vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));

    // store render target array index
    OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
    if (state.backendState.readRenderTargetArrayIndex)
    {
        simd16vector vRtai[3];
        pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
        simd16scalari vRtaii;
        vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
    }
    else
    {
        _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if (CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
                (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        {
            const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
            const uint32_t j = triIndex & 7;  // triIndex % KNOB_SIMD_WIDTH

            _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
            _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
            _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
            _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
        }

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
        }

        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        triMask &= ~(1 << triIndex);
    }

endBinTriangles:

    AR_END(FEBinTriangles, 1);
}

#endif
1346 struct FEBinTrianglesChooser
1348 typedef PFN_PROCESS_PRIMS FuncType
;
1350 template <typename
... ArgsB
>
1351 static FuncType
GetFunc()
1353 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
1357 // Selector for correct templated BinTrinagles function
1358 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
1360 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
1363 #if USE_SIMD16_FRONTEND
1364 struct FEBinTrianglesChooser_simd16
1366 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType
;
1368 template <typename
... ArgsB
>
1369 static FuncType
GetFunc()
1371 return BinTriangles_simd16
<ConservativeRastFETraits
<ArgsB
...>>;
1375 // Selector for correct templated BinTrinagles function
1376 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
)
1378 return TemplateArgUnroller
<FEBinTrianglesChooser_simd16
>::GetFunc(IsConservative
);
1383 void BinPostSetupPoints(
1390 simdscalari viewportIdx
)
1392 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1394 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1396 simdvector
& primVerts
= prim
[0];
1398 const API_STATE
& state
= GetApiState(pDC
);
1399 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1400 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1402 // Select attribute processor
1403 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1404 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1406 // convert to fixed point
1407 simdscalari vXi
, vYi
;
1408 vXi
= fpToFixedPointVertical(primVerts
.x
);
1409 vYi
= fpToFixedPointVertical(primVerts
.y
);
1411 if (CanUseSimplePoints(pDC
))
1413 // adjust for ymin-xmin rule
1414 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
1415 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
1417 // cull points off the ymin-xmin edge of the viewport
1418 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
1419 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
1421 // compute macro tile coordinates
1422 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1423 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1425 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
1426 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
1427 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
1429 // compute raster tile coordinates
1430 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1431 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1433 // compute raster tile relative x,y for coverage mask
1434 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1435 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1437 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1438 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1440 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
1441 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
1442 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
1443 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
1445 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
1446 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
1447 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
1448 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
1450 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
1451 _simd_store_ps((float*)aZ
, primVerts
.z
);
1453 // store render target array index
1454 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1455 if (state
.backendState
.readRenderTargetArrayIndex
)
1458 pa
.Assemble(VERTEX_SGV_SLOT
, &vRtai
);
1459 simdscalari vRtaii
= _simd_castps_si(vRtai
[VERTEX_SGV_RTAI_COMP
]);
1460 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1464 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1467 uint32_t *pPrimID
= (uint32_t *)&primID
;
1468 DWORD primIndex
= 0;
1470 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1472 // scan remaining valid triangles and bin each separately
1473 while (_BitScanForward(&primIndex
, primMask
))
1475 uint32_t linkageCount
= backendState
.numAttributes
;
1476 uint32_t numScalarAttribs
= linkageCount
* 4;
1481 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1483 // points are always front facing
1484 desc
.triFlags
.frontFacing
= 1;
1485 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1486 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1488 work
.pfnWork
= RasterizeSimplePoint
;
1490 auto pArena
= pDC
->pArena
;
1491 SWR_ASSERT(pArena
!= nullptr);
1494 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1495 desc
.pAttribs
= pAttribs
;
1496 desc
.numAttribs
= linkageCount
;
1498 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1500 // store raster tile aligned x, y, perspective correct z
1501 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1502 desc
.pTriBuffer
= pTriBuffer
;
1503 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1504 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1505 *pTriBuffer
= aZ
[primIndex
];
1507 uint32_t tX
= aTileRelativeX
[primIndex
];
1508 uint32_t tY
= aTileRelativeY
[primIndex
];
1510 // pack the relative x,y into the coverageMask, the rasterizer will
1511 // generate the true coverage mask from it
1512 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1515 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1516 #if KNOB_ENABLE_TOSS_POINTS
1517 if (!KNOB_TOSS_SETUP_TRIS
)
1520 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1522 primMask
&= ~(1 << primIndex
);
1527 // non simple points need to be potentially binned to multiple macro tiles
1528 simdscalar vPointSize
;
1529 if (rastState
.pointParam
)
1532 pa
.Assemble(VERTEX_SGV_SLOT
, size
);
1533 vPointSize
= size
[0][VERTEX_SGV_POINT_SIZE_COMP
];
1537 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
1540 // bloat point to bbox
1542 bbox
.xmin
= bbox
.xmax
= vXi
;
1543 bbox
.ymin
= bbox
.ymax
= vYi
;
1545 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
1546 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1547 bbox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1548 bbox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
1549 bbox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1550 bbox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
1552 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1553 // Gather the AOS effective scissor rects based on the per-prim VP index.
1554 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1556 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1557 if (state
.backendState
.readViewportArrayIndex
)
1559 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1560 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1562 else // broadcast fast path for non-VPAI case.
1564 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1565 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1566 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1567 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1570 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
1571 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
1572 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
1573 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
1576 // Cull bloated points completely outside scissor
1577 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1578 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1579 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1580 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1581 primMask
= primMask
& ~maskOutsideScissor
;
1583 // Convert bbox to macrotile units.
1584 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1585 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1586 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1587 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1589 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1590 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
1591 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
1592 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
1593 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
1595 // store render target array index
1596 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1597 if (state
.backendState
.readRenderTargetArrayIndex
)
1599 simdvector vRtai
[2];
1600 pa
.Assemble(VERTEX_SGV_SLOT
, vRtai
);
1601 simdscalari vRtaii
= _simd_castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
1602 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1606 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1609 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
1610 _simd_store_ps((float*)aPointSize
, vPointSize
);
1612 uint32_t *pPrimID
= (uint32_t *)&primID
;
1614 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
1615 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
1616 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
1618 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
1619 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
1620 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
1622 // scan remaining valid prims and bin each separately
1623 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1625 while (_BitScanForward(&primIndex
, primMask
))
1627 uint32_t linkageCount
= backendState
.numAttributes
;
1628 uint32_t numScalarAttribs
= linkageCount
* 4;
1633 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1635 desc
.triFlags
.frontFacing
= 1;
1636 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1637 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1638 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1640 work
.pfnWork
= RasterizeTriPoint
;
1642 auto pArena
= pDC
->pArena
;
1643 SWR_ASSERT(pArena
!= nullptr);
1645 // store active attribs
1646 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1647 desc
.numAttribs
= linkageCount
;
1648 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1650 // store point vertex data
1651 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1652 desc
.pTriBuffer
= pTriBuffer
;
1653 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1654 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1655 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1657 // store user clip distances
1658 if (rastState
.clipDistanceMask
)
1660 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
1661 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1664 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
1665 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1666 desc
.pUserClipBuffer
[3*i
+ 0] = 0.0f
;
1667 desc
.pUserClipBuffer
[3*i
+ 1] = 0.0f
;
1668 desc
.pUserClipBuffer
[3*i
+ 2] = dists
[i
];
1672 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1673 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1675 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1677 #if KNOB_ENABLE_TOSS_POINTS
1678 if (!KNOB_TOSS_SETUP_TRIS
)
1681 pTileMgr
->enqueue(x
, y
, &work
);
1686 primMask
&= ~(1 << primIndex
);
1690 AR_END(FEBinPoints
, 1);
1693 //////////////////////////////////////////////////////////////////////////
1694 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1695 /// @param pDC - pointer to draw context.
1696 /// @param pa - The primitive assembly object.
1697 /// @param workerId - thread's worker id. Each thread has a unique id.
1698 /// @param prim - Contains point position data for SIMDs worth of points.
1699 /// @param primID - Primitive ID for each point.
1708 simdvector
& primVerts
= prim
[0];
1710 const API_STATE
& state
= GetApiState(pDC
);
1711 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1712 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1714 // Read back viewport index if required
1715 simdscalari viewportIdx
= _simd_set1_epi32(0);
1716 if (state
.backendState
.readViewportArrayIndex
)
1718 simdvector vpiAttrib
[1];
1719 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
1720 simdscalari vpai
= _simd_castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1722 // OOB indices => forced to zero.
1723 vpai
= _simd_max_epi32(_simd_setzero_si(), vpai
);
1724 simdscalari vNumViewports
= _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1725 simdscalari vClearMask
= _simd_cmplt_epi32(vpai
, vNumViewports
);
1726 viewportIdx
= _simd_and_si(vClearMask
, vpai
);
1729 if (!feState
.vpTransformDisable
)
1731 // perspective divide
1732 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
1733 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
1734 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
1735 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
1737 // viewport transform to screen coords
1738 if (state
.backendState
.readViewportArrayIndex
)
1740 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
1744 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
1748 // adjust for pixel center location
1749 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1750 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
1751 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
1763 #if USE_SIMD16_FRONTEND
1764 void BinPostSetupPoints_simd16(
1768 simd16vector prim
[],
1770 simd16scalari primID
,
1771 simd16scalari viewportIdx
)
1773 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1775 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
1777 simd16vector
& primVerts
= prim
[0];
1779 const API_STATE
& state
= GetApiState(pDC
);
1780 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1781 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1783 // Select attribute processor
1784 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1785 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1787 // convert to fixed point
1788 simd16scalari vXi
, vYi
;
1790 vXi
= fpToFixedPointVertical(primVerts
.x
);
1791 vYi
= fpToFixedPointVertical(primVerts
.y
);
1793 if (CanUseSimplePoints(pDC
))
1795 // adjust for ymin-xmin rule
1796 vXi
= _simd16_sub_epi32(vXi
, _simd16_set1_epi32(1));
1797 vYi
= _simd16_sub_epi32(vYi
, _simd16_set1_epi32(1));
1799 // cull points off the ymin-xmin edge of the viewport
1800 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi
));
1801 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi
));
1803 // compute macro tile coordinates
1804 simd16scalari macroX
= _simd16_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1805 simd16scalari macroY
= _simd16_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1807 OSALIGNSIMD16(uint32_t) aMacroX
[KNOB_SIMD16_WIDTH
], aMacroY
[KNOB_SIMD16_WIDTH
];
1809 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMacroX
), macroX
);
1810 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMacroY
), macroY
);
1812 // compute raster tile coordinates
1813 simd16scalari rasterX
= _simd16_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1814 simd16scalari rasterY
= _simd16_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1816 // compute raster tile relative x,y for coverage mask
1817 simd16scalari tileAlignedX
= _simd16_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1818 simd16scalari tileAlignedY
= _simd16_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1820 simd16scalari tileRelativeX
= _simd16_sub_epi32(_simd16_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1821 simd16scalari tileRelativeY
= _simd16_sub_epi32(_simd16_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1823 OSALIGNSIMD16(uint32_t) aTileRelativeX
[KNOB_SIMD16_WIDTH
];
1824 OSALIGNSIMD16(uint32_t) aTileRelativeY
[KNOB_SIMD16_WIDTH
];
1826 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileRelativeX
), tileRelativeX
);
1827 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileRelativeY
), tileRelativeY
);
1829 OSALIGNSIMD16(uint32_t) aTileAlignedX
[KNOB_SIMD16_WIDTH
];
1830 OSALIGNSIMD16(uint32_t) aTileAlignedY
[KNOB_SIMD16_WIDTH
];
1832 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileAlignedX
), tileAlignedX
);
1833 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aTileAlignedY
), tileAlignedY
);
1835 OSALIGNSIMD16(float) aZ
[KNOB_SIMD16_WIDTH
];
1836 _simd16_store_ps(reinterpret_cast<float *>(aZ
), primVerts
.z
);
1838 // store render target array index
1839 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1840 if (state
.backendState
.readRenderTargetArrayIndex
)
1843 pa
.Assemble_simd16(VERTEX_SGV_SLOT
, &vRtai
);
1844 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[VERTEX_SGV_RTAI_COMP
]);
1845 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1849 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1852 uint32_t *pPrimID
= (uint32_t *)&primID
;
1853 DWORD primIndex
= 0;
1855 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1857 // scan remaining valid triangles and bin each separately
1858 while (_BitScanForward(&primIndex
, primMask
))
1860 uint32_t linkageCount
= backendState
.numAttributes
;
1861 uint32_t numScalarAttribs
= linkageCount
* 4;
1866 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1868 // points are always front facing
1869 desc
.triFlags
.frontFacing
= 1;
1870 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1871 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1873 work
.pfnWork
= RasterizeSimplePoint
;
1875 auto pArena
= pDC
->pArena
;
1876 SWR_ASSERT(pArena
!= nullptr);
1879 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1880 desc
.pAttribs
= pAttribs
;
1881 desc
.numAttribs
= linkageCount
;
1883 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1885 // store raster tile aligned x, y, perspective correct z
1886 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1887 desc
.pTriBuffer
= pTriBuffer
;
1888 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1889 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1890 *pTriBuffer
= aZ
[primIndex
];
1892 uint32_t tX
= aTileRelativeX
[primIndex
];
1893 uint32_t tY
= aTileRelativeY
[primIndex
];
1895 // pack the relative x,y into the coverageMask, the rasterizer will
1896 // generate the true coverage mask from it
1897 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1900 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1901 #if KNOB_ENABLE_TOSS_POINTS
1902 if (!KNOB_TOSS_SETUP_TRIS
)
1905 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1908 primMask
&= ~(1 << primIndex
);
1913 // non simple points need to be potentially binned to multiple macro tiles
1914 simd16scalar vPointSize
;
1916 if (rastState
.pointParam
)
1918 simd16vector size
[3];
1919 pa
.Assemble_simd16(VERTEX_SGV_SLOT
, size
);
1920 vPointSize
= size
[0][VERTEX_SGV_POINT_SIZE_COMP
];
1924 vPointSize
= _simd16_set1_ps(rastState
.pointSize
);
1927 // bloat point to bbox
1930 bbox
.xmin
= bbox
.xmax
= vXi
;
1931 bbox
.ymin
= bbox
.ymax
= vYi
;
1933 simd16scalar vHalfWidth
= _simd16_mul_ps(vPointSize
, _simd16_set1_ps(0.5f
));
1934 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
1936 bbox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
1937 bbox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
1938 bbox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
1939 bbox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
1941 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1942 // Gather the AOS effective scissor rects based on the per-prim VP index.
1943 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1945 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1946 if (state
.backendState
.readViewportArrayIndex
)
1948 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
1949 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1951 else // broadcast fast path for non-VPAI case.
1953 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1954 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1955 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1956 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1959 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
1960 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
1961 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
1962 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
1965 // Cull bloated points completely outside scissor
1966 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1967 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1968 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1969 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
1970 primMask
= primMask
& ~maskOutsideScissor
;
1972 // Convert bbox to macrotile units.
1973 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1974 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1975 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1976 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1978 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
1980 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
1981 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
1982 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
1983 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
1985 // store render target array index
1986 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
1987 if (state
.backendState
.readRenderTargetArrayIndex
)
1989 simd16vector vRtai
[2];
1990 pa
.Assemble_simd16(VERTEX_SGV_SLOT
, vRtai
);
1991 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
1992 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
1996 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
1999 OSALIGNSIMD16(float) aPointSize
[KNOB_SIMD16_WIDTH
];
2000 _simd16_store_ps(reinterpret_cast<float *>(aPointSize
), vPointSize
);
2002 uint32_t *pPrimID
= (uint32_t *)&primID
;
2004 OSALIGNSIMD16(float) aPrimVertsX
[KNOB_SIMD16_WIDTH
];
2005 OSALIGNSIMD16(float) aPrimVertsY
[KNOB_SIMD16_WIDTH
];
2006 OSALIGNSIMD16(float) aPrimVertsZ
[KNOB_SIMD16_WIDTH
];
2008 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX
), primVerts
.x
);
2009 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY
), primVerts
.y
);
2010 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ
), primVerts
.z
);
2012 // scan remaining valid prims and bin each separately
2013 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
2015 while (_BitScanForward(&primIndex
, primMask
))
2017 uint32_t linkageCount
= backendState
.numAttributes
;
2018 uint32_t numScalarAttribs
= linkageCount
* 4;
2023 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2025 desc
.triFlags
.frontFacing
= 1;
2026 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
2027 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2028 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2030 work
.pfnWork
= RasterizeTriPoint
;
2032 auto pArena
= pDC
->pArena
;
2033 SWR_ASSERT(pArena
!= nullptr);
2035 // store active attribs
2036 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2037 desc
.numAttribs
= linkageCount
;
2038 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2040 // store point vertex data
2041 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2042 desc
.pTriBuffer
= pTriBuffer
;
2043 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
2044 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
2045 *pTriBuffer
= aPrimVertsZ
[primIndex
];
2047 // store user clip distances
2048 if (rastState
.clipDistanceMask
)
2050 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2051 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
2054 ProcessUserClipDist
<1>(pa
, primIndex
, rastState
.clipDistanceMask
, &one
, dists
);
2055 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
2056 desc
.pUserClipBuffer
[3 * i
+ 0] = 0.0f
;
2057 desc
.pUserClipBuffer
[3 * i
+ 1] = 0.0f
;
2058 desc
.pUserClipBuffer
[3 * i
+ 2] = dists
[i
];
2062 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2063 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2065 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2067 #if KNOB_ENABLE_TOSS_POINTS
2068 if (!KNOB_TOSS_SETUP_TRIS
)
2071 pTileMgr
->enqueue(x
, y
, &work
);
2076 primMask
&= ~(1 << primIndex
);
2080 AR_END(FEBinPoints
, 1);
2083 void SIMDCALL
BinPoints_simd16(
2087 simd16vector prim
[3],
2089 simd16scalari primID
)
2091 simd16vector
& primVerts
= prim
[0];
2093 const API_STATE
& state
= GetApiState(pDC
);
2094 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2095 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2097 // Read back viewport index if required
2098 simd16scalari viewportIdx
= _simd16_set1_epi32(0);
2099 if (state
.backendState
.readViewportArrayIndex
)
2101 simd16vector vpiAttrib
[1];
2102 pa
.Assemble_simd16(VERTEX_SGV_SLOT
, vpiAttrib
);
2104 // OOB indices => forced to zero.
2105 simd16scalari vpai
= _simd16_castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2106 vpai
= _simd16_max_epi32(_simd16_setzero_si(), vpai
);
2107 simd16scalari vNumViewports
= _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2108 simd16scalari vClearMask
= _simd16_cmplt_epi32(vpai
, vNumViewports
);
2109 viewportIdx
= _simd16_and_si(vClearMask
, vpai
);
2112 if (!feState
.vpTransformDisable
)
2114 // perspective divide
2115 simd16scalar vRecipW0
= _simd16_div_ps(_simd16_set1_ps(1.0f
), primVerts
.w
);
2117 primVerts
.x
= _simd16_mul_ps(primVerts
.x
, vRecipW0
);
2118 primVerts
.y
= _simd16_mul_ps(primVerts
.y
, vRecipW0
);
2119 primVerts
.z
= _simd16_mul_ps(primVerts
.z
, vRecipW0
);
2121 // viewport transform to screen coords
2122 if (state
.backendState
.readViewportArrayIndex
)
2124 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
2128 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
2132 const simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2134 primVerts
.x
= _simd16_add_ps(primVerts
.x
, offset
);
2135 primVerts
.y
= _simd16_add_ps(primVerts
.y
, offset
);
2137 BinPostSetupPoints_simd16(
2148 //////////////////////////////////////////////////////////////////////////
2149 /// @brief Bin SIMD lines to the backend.
2150 /// @param pDC - pointer to draw context.
2151 /// @param pa - The primitive assembly object.
2152 /// @param workerId - thread's worker id. Even thread has a unique id.
2153 /// @param tri - Contains line position data for SIMDs worth of points.
2154 /// @param primID - Primitive ID for each line.
2155 /// @param viewportIdx - Viewport Array Index for each line.
2156 void BinPostSetupLines(
2161 simdscalar recipW
[],
2164 simdscalari viewportIdx
)
2166 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2168 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2170 const API_STATE
& state
= GetApiState(pDC
);
2171 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2173 // Select attribute processor
2174 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2175 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2177 simdscalar
& vRecipW0
= recipW
[0];
2178 simdscalar
& vRecipW1
= recipW
[1];
2180 simd4scalar vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2182 // convert to fixed point
2183 simdscalari vXi
[2], vYi
[2];
2184 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2185 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2186 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2187 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2189 // compute x-major vs y-major mask
2190 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2191 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2192 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2193 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2195 // cull zero-length lines
2196 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2197 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2199 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2201 uint32_t *pPrimID
= (uint32_t *)&primID
;
2202 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2204 simdscalar vUnused
= _simd_setzero_ps();
2206 // Calc bounding box of lines
2208 bbox
.xmin
= _simd_min_epi32(vXi
[0], vXi
[1]);
2209 bbox
.xmax
= _simd_max_epi32(vXi
[0], vXi
[1]);
2210 bbox
.ymin
= _simd_min_epi32(vYi
[0], vYi
[1]);
2211 bbox
.ymax
= _simd_max_epi32(vYi
[0], vYi
[1]);
2213 // bloat bbox by line width along minor axis
2214 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2215 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2217 bloatBox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2218 bloatBox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
2219 bloatBox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2220 bloatBox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
2222 bbox
.xmin
= _simd_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2223 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2224 bbox
.ymin
= _simd_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2225 bbox
.ymax
= _simd_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2227 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2229 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2230 if (state
.backendState
.readViewportArrayIndex
)
2232 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2233 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2235 else // broadcast fast path for non-VPAI case.
2237 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2238 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2239 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2240 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2243 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2244 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2245 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2246 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2249 // Cull prims completely outside scissor
2251 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2252 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2253 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2254 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2255 primMask
= primMask
& ~maskOutsideScissor
;
2263 // Convert triangle bbox to macrotile units.
2264 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2265 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2266 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2267 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2269 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2270 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2271 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2272 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2273 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2275 // transpose verts needed for backend
2276 /// @todo modify BE to take non-transformed verts
2277 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2278 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2279 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2280 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2282 // store render target array index
2283 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2284 if (state
.backendState
.readRenderTargetArrayIndex
)
2286 simdvector vRtai
[2];
2287 pa
.Assemble(VERTEX_SGV_SLOT
, vRtai
);
2288 simdscalari vRtaii
= _simd_castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
2289 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2293 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2296 // scan remaining valid prims and bin each separately
2298 while (_BitScanForward(&primIndex
, primMask
))
2300 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2301 uint32_t numScalarAttribs
= linkageCount
* 4;
2306 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2308 desc
.triFlags
.frontFacing
= 1;
2309 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2310 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2311 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2313 work
.pfnWork
= RasterizeLine
;
2315 auto pArena
= pDC
->pArena
;
2316 SWR_ASSERT(pArena
!= nullptr);
2318 // store active attribs
2319 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2320 desc
.numAttribs
= linkageCount
;
2321 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2323 // store line vertex data
2324 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2325 SIMD128::store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2326 SIMD128::store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2327 SIMD128::store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2328 SIMD128::store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2330 // store user clip distances
2331 if (rastState
.clipDistanceMask
)
2333 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2334 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2335 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2338 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2339 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2341 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2343 #if KNOB_ENABLE_TOSS_POINTS
2344 if (!KNOB_TOSS_SETUP_TRIS
)
2347 pTileMgr
->enqueue(x
, y
, &work
);
2352 primMask
&= ~(1 << primIndex
);
2357 AR_END(FEBinLines
, 1);
2360 #if USE_SIMD16_FRONTEND
2361 void BinPostSetupLines_simd16(
2365 simd16vector prim
[],
2366 simd16scalar recipW
[],
2368 simd16scalari primID
,
2369 simd16scalari viewportIdx
)
2371 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2373 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2375 const API_STATE
& state
= GetApiState(pDC
);
2376 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2378 // Select attribute processor
2379 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2380 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2382 simd16scalar
& vRecipW0
= recipW
[0];
2383 simd16scalar
& vRecipW1
= recipW
[1];
2385 // convert to fixed point
2386 simd16scalari vXi
[2], vYi
[2];
2388 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2389 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2390 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2391 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2393 // compute x-major vs y-major mask
2394 simd16scalari xLength
= _simd16_abs_epi32(_simd16_sub_epi32(vXi
[0], vXi
[1]));
2395 simd16scalari yLength
= _simd16_abs_epi32(_simd16_sub_epi32(vYi
[0], vYi
[1]));
2396 simd16scalar vYmajorMask
= _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength
, xLength
));
2397 uint32_t yMajorMask
= _simd16_movemask_ps(vYmajorMask
);
2399 // cull zero-length lines
2400 simd16scalari vZeroLengthMask
= _simd16_cmpeq_epi32(xLength
, _simd16_setzero_si());
2401 vZeroLengthMask
= _simd16_and_si(vZeroLengthMask
, _simd16_cmpeq_epi32(yLength
, _simd16_setzero_si()));
2403 primMask
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask
));
2405 uint32_t *pPrimID
= (uint32_t *)&primID
;
2406 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2408 // Calc bounding box of lines
2410 bbox
.xmin
= _simd16_min_epi32(vXi
[0], vXi
[1]);
2411 bbox
.xmax
= _simd16_max_epi32(vXi
[0], vXi
[1]);
2412 bbox
.ymin
= _simd16_min_epi32(vYi
[0], vYi
[1]);
2413 bbox
.ymax
= _simd16_max_epi32(vYi
[0], vYi
[1]);
2415 // bloat bbox by line width along minor axis
2416 simd16scalar vHalfWidth
= _simd16_set1_ps(rastState
.lineWidth
/ 2.0f
);
2417 simd16scalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2419 simd16BBox bloatBox
;
2421 bloatBox
.xmin
= _simd16_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2422 bloatBox
.xmax
= _simd16_add_epi32(bbox
.xmax
, vHalfWidthi
);
2423 bloatBox
.ymin
= _simd16_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2424 bloatBox
.ymax
= _simd16_add_epi32(bbox
.ymax
, vHalfWidthi
);
2426 bbox
.xmin
= _simd16_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2427 bbox
.xmax
= _simd16_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2428 bbox
.ymin
= _simd16_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2429 bbox
.ymax
= _simd16_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2431 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2433 simd16scalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2435 if (state
.backendState
.readViewportArrayIndex
)
2437 GatherScissors_simd16
<KNOB_SIMD16_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2438 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2440 else // broadcast fast path for non-VPAI case.
2442 scisXmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2443 scisYmin
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2444 scisXmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2445 scisYmax
= _simd16_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2448 bbox
.xmin
= _simd16_max_epi32(bbox
.xmin
, scisXmin
);
2449 bbox
.ymin
= _simd16_max_epi32(bbox
.ymin
, scisYmin
);
2450 bbox
.xmax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.xmax
, _simd16_set1_epi32(1)), scisXmax
);
2451 bbox
.ymax
= _simd16_min_epi32(_simd16_sub_epi32(bbox
.ymax
, _simd16_set1_epi32(1)), scisYmax
);
2454 // Cull prims completely outside scissor
2456 simd16scalari maskOutsideScissorX
= _simd16_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2457 simd16scalari maskOutsideScissorY
= _simd16_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2458 simd16scalari maskOutsideScissorXY
= _simd16_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2459 uint32_t maskOutsideScissor
= _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY
));
2460 primMask
= primMask
& ~maskOutsideScissor
;
2463 const simdscalar unused
= _simd_setzero_ps();
2465 // transpose verts needed for backend
2466 /// @todo modify BE to take non-transformed verts
2467 simd4scalar vHorizX
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2468 simd4scalar vHorizY
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2469 simd4scalar vHorizZ
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2470 simd4scalar vHorizW
[2][KNOB_SIMD_WIDTH
]; // KNOB_SIMD16_WIDTH
2477 // Convert triangle bbox to macrotile units.
2478 bbox
.xmin
= _simd16_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2479 bbox
.ymin
= _simd16_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2480 bbox
.xmax
= _simd16_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2481 bbox
.ymax
= _simd16_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2483 OSALIGNSIMD16(uint32_t) aMTLeft
[KNOB_SIMD16_WIDTH
], aMTRight
[KNOB_SIMD16_WIDTH
], aMTTop
[KNOB_SIMD16_WIDTH
], aMTBottom
[KNOB_SIMD16_WIDTH
];
2485 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTLeft
), bbox
.xmin
);
2486 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTRight
), bbox
.xmax
);
2487 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTTop
), bbox
.ymin
);
2488 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aMTBottom
), bbox
.ymax
);
2490 vTranspose3x8(vHorizX
[0], _simd16_extract_ps(prim
[0].x
, 0), _simd16_extract_ps(prim
[1].x
, 0), unused
);
2491 vTranspose3x8(vHorizY
[0], _simd16_extract_ps(prim
[0].y
, 0), _simd16_extract_ps(prim
[1].y
, 0), unused
);
2492 vTranspose3x8(vHorizZ
[0], _simd16_extract_ps(prim
[0].z
, 0), _simd16_extract_ps(prim
[1].z
, 0), unused
);
2493 vTranspose3x8(vHorizW
[0], _simd16_extract_ps(vRecipW0
, 0), _simd16_extract_ps(vRecipW1
, 0), unused
);
2495 vTranspose3x8(vHorizX
[1], _simd16_extract_ps(prim
[0].x
, 1), _simd16_extract_ps(prim
[1].x
, 1), unused
);
2496 vTranspose3x8(vHorizY
[1], _simd16_extract_ps(prim
[0].y
, 1), _simd16_extract_ps(prim
[1].y
, 1), unused
);
2497 vTranspose3x8(vHorizZ
[1], _simd16_extract_ps(prim
[0].z
, 1), _simd16_extract_ps(prim
[1].z
, 1), unused
);
2498 vTranspose3x8(vHorizW
[1], _simd16_extract_ps(vRecipW0
, 1), _simd16_extract_ps(vRecipW1
, 1), unused
);
2500 // store render target array index
2501 OSALIGNSIMD16(uint32_t) aRTAI
[KNOB_SIMD16_WIDTH
];
2502 if (state
.backendState
.readRenderTargetArrayIndex
)
2504 simd16vector vRtai
[2];
2505 pa
.Assemble_simd16(VERTEX_SGV_SLOT
, vRtai
);
2506 simd16scalari vRtaii
= _simd16_castps_si(vRtai
[0][VERTEX_SGV_RTAI_COMP
]);
2507 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), vRtaii
);
2511 _simd16_store_si(reinterpret_cast<simd16scalari
*>(aRTAI
), _simd16_setzero_si());
2514 // scan remaining valid prims and bin each separately
2516 while (_BitScanForward(&primIndex
, primMask
))
2518 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2519 uint32_t numScalarAttribs
= linkageCount
* 4;
2524 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2526 desc
.triFlags
.frontFacing
= 1;
2527 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2528 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2529 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2531 work
.pfnWork
= RasterizeLine
;
2533 auto pArena
= pDC
->pArena
;
2534 SWR_ASSERT(pArena
!= nullptr);
2536 // store active attribs
2537 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2538 desc
.numAttribs
= linkageCount
;
2539 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2541 // store line vertex data
2542 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2545 const uint32_t i
= primIndex
>> 3; // triIndex / KNOB_SIMD_WIDTH
2546 const uint32_t j
= primIndex
& 7; // triIndex % KNOB_SIMD_WIDTH
2548 _mm_store_ps(&desc
.pTriBuffer
[ 0], vHorizX
[i
][j
]);
2549 _mm_store_ps(&desc
.pTriBuffer
[ 4], vHorizY
[i
][j
]);
2550 _mm_store_ps(&desc
.pTriBuffer
[ 8], vHorizZ
[i
][j
]);
2551 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[i
][j
]);
2554 // store user clip distances
2555 if (rastState
.clipDistanceMask
)
2557 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2558 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2559 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
2562 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2563 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2565 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2567 #if KNOB_ENABLE_TOSS_POINTS
2568 if (!KNOB_TOSS_SETUP_TRIS
)
2571 pTileMgr
->enqueue(x
, y
, &work
);
2576 primMask
&= ~(1 << primIndex
);
2581 AR_END(FEBinLines
, 1);
2585 //////////////////////////////////////////////////////////////////////////
2586 /// @brief Bin SIMD lines to the backend.
2587 /// @param pDC - pointer to draw context.
2588 /// @param pa - The primitive assembly object.
2589 /// @param workerId - thread's worker id. Every thread has a unique id.
2590 /// @param tri - Contains line position data for SIMDs worth of lines.
2591 /// @param primID - Primitive ID for each line.
2592 /// @param viewportIdx - Viewport Array Index for each line.
2601 const API_STATE
& state
= GetApiState(pDC
);
2602 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2603 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2605 simdscalar vRecipW
[2] = { _simd_set1_ps(1.0f
), _simd_set1_ps(1.0f
) };
2607 simdscalari viewportIdx
= _simd_set1_epi32(0);
2608 if (state
.backendState
.readViewportArrayIndex
)
2610 simdvector vpiAttrib
[2];
2611 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
2612 simdscalari vpai
= _simd_castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2613 vpai
= _simd_max_epi32(_simd_setzero_si(), vpai
);
2615 // OOB indices => forced to zero.
2616 simdscalari vNumViewports
= _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2617 simdscalari vClearMask
= _simd_cmplt_epi32(vpai
, vNumViewports
);
2618 viewportIdx
= _simd_and_si(vClearMask
, vpai
);
2621 if (!feState
.vpTransformDisable
)
2623 // perspective divide
2624 vRecipW
[0] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2625 vRecipW
[1] = _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
2627 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2628 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2630 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2631 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2633 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2634 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2636 // viewport transform to screen coords
2637 if (state
.backendState
.readViewportArrayIndex
)
2639 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2643 viewportTransform
<2>(prim
, state
.vpMatrices
);
2647 // adjust for pixel center location
2648 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2649 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2650 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2652 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2653 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2666 #if USE_SIMD16_FRONTEND
2667 void SIMDCALL
BinLines_simd16(
2671 simd16vector prim
[3],
2673 simd16scalari primID
)
2675 const API_STATE
& state
= GetApiState(pDC
);
2676 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2677 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2679 simd16scalar vRecipW
[2] = { _simd16_set1_ps(1.0f
), _simd16_set1_ps(1.0f
) };
2681 simd16scalari viewportIdx
= _simd16_set1_epi32(0);
2682 if (state
.backendState
.readViewportArrayIndex
)
2684 simd16vector vpiAttrib
[2];
2685 pa
.Assemble_simd16(VERTEX_SGV_SLOT
, vpiAttrib
);
2687 // OOB indices => forced to zero.
2688 simd16scalari vpai
= _simd16_castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2689 vpai
= _simd16_max_epi32(_simd16_setzero_si(), vpai
);
2690 simd16scalari vNumViewports
= _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2691 simd16scalari vClearMask
= _simd16_cmplt_epi32(vpai
, vNumViewports
);
2692 viewportIdx
= _simd16_and_si(vClearMask
, vpai
);
2695 if (!feState
.vpTransformDisable
)
2697 // perspective divide
2698 vRecipW
[0] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[0].w
);
2699 vRecipW
[1] = _simd16_div_ps(_simd16_set1_ps(1.0f
), prim
[1].w
);
2701 prim
[0].v
[0] = _simd16_mul_ps(prim
[0].v
[0], vRecipW
[0]);
2702 prim
[1].v
[0] = _simd16_mul_ps(prim
[1].v
[0], vRecipW
[1]);
2704 prim
[0].v
[1] = _simd16_mul_ps(prim
[0].v
[1], vRecipW
[0]);
2705 prim
[1].v
[1] = _simd16_mul_ps(prim
[1].v
[1], vRecipW
[1]);
2707 prim
[0].v
[2] = _simd16_mul_ps(prim
[0].v
[2], vRecipW
[0]);
2708 prim
[1].v
[2] = _simd16_mul_ps(prim
[1].v
[2], vRecipW
[1]);
2710 // viewport transform to screen coords
2711 if (state
.backendState
.readViewportArrayIndex
)
2713 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2717 viewportTransform
<2>(prim
, state
.vpMatrices
);
2721 // adjust for pixel center location
2722 simd16scalar offset
= g_pixelOffsets_simd16
[rastState
.pixelLocation
];
2724 prim
[0].x
= _simd16_add_ps(prim
[0].x
, offset
);
2725 prim
[0].y
= _simd16_add_ps(prim
[0].y
, offset
);
2727 prim
[1].x
= _simd16_add_ps(prim
[1].x
, offset
);
2728 prim
[1].y
= _simd16_add_ps(prim
[1].y
, offset
);
2730 BinPostSetupLines_simd16(