1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for clipping
27 ******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices
[7];
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
#define CLIPCODE_SHIFT 23

// Per-vertex clip code bits.
// Guardband is able to use a single high-bit with 4 separate LSBs, because it
// computes a union, rather than intersection, of clipcodes.
enum SWR_CLIPCODES
{
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    NEGW             = (0x40 << CLIPCODE_SHIFT),

    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// All six view-frustum plane bits.
#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
// Bits tested by Clipper::ComputeClipMask: near/far, the four guardband edges and negative w.
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
/// Scalar clip routine, defined elsewhere. Clipper::ClipScalar calls it with
/// three packed xyzw positions, the packed per-vertex attributes and the
/// per-vertex scalar attribute count; it reports the vertex count via numVerts.
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
          int *numVerts, float *pOutAttribs);
66 void ComputeClipCodes(const API_STATE
& state
, const simdvector
& vertex
, simdscalar
& clipCodes
, simdscalari viewportIndexes
)
68 clipCodes
= _simd_setzero_ps();
71 simdscalar vNegW
= _simd_mul_ps(vertex
.w
, _simd_set1_ps(-1.0f
));
74 simdscalar vRes
= _simd_cmplt_ps(vertex
.x
, vNegW
);
75 clipCodes
= _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT
)));
78 vRes
= _simd_cmplt_ps(vertex
.y
, vNegW
);
79 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP
))));
82 vRes
= _simd_cmpgt_ps(vertex
.x
, vertex
.w
);
83 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT
))));
86 vRes
= _simd_cmpgt_ps(vertex
.y
, vertex
.w
);
87 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM
))));
89 if (state
.rastState
.depthClipEnable
)
92 // DX clips depth [0..w], GL clips [-w..w]
93 if (state
.rastState
.clipHalfZ
)
95 vRes
= _simd_cmplt_ps(vertex
.z
, _simd_setzero_ps());
99 vRes
= _simd_cmplt_ps(vertex
.z
, vNegW
);
101 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR
))));
104 vRes
= _simd_cmpgt_ps(vertex
.z
, vertex
.w
);
105 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR
))));
109 vRes
= _simd_cmple_ps(vertex
.w
, _simd_setzero_ps());
110 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(NEGW
))));
113 simdscalar gbMult
= _simd_mul_ps(vNegW
, _simd_i32gather_ps(&state
.gbState
.left
[0], viewportIndexes
, 4));
114 vRes
= _simd_cmplt_ps(vertex
.x
, gbMult
);
115 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT
))));
118 gbMult
= _simd_mul_ps(vNegW
, _simd_i32gather_ps(&state
.gbState
.top
[0], viewportIndexes
, 4));
119 vRes
= _simd_cmplt_ps(vertex
.y
, gbMult
);
120 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP
))));
123 gbMult
= _simd_mul_ps(vertex
.w
, _simd_i32gather_ps(&state
.gbState
.right
[0], viewportIndexes
, 4));
124 vRes
= _simd_cmpgt_ps(vertex
.x
, gbMult
);
125 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT
))));
128 gbMult
= _simd_mul_ps(vertex
.w
, _simd_i32gather_ps(&state
.gbState
.bottom
[0], viewportIndexes
, 4));
129 vRes
= _simd_cmpgt_ps(vertex
.y
, gbMult
);
130 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM
))));
133 template<uint32_t NumVertsPerPrim
>
137 Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
138 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
140 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
143 void ComputeClipCodes(simdvector vertex
[], simdscalari viewportIndexes
)
145 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
147 ::ComputeClipCodes(this->state
, vertex
[i
], this->clipCodes
[i
], viewportIndexes
);
151 simdscalar
ComputeClipCodeIntersection()
153 simdscalar result
= this->clipCodes
[0];
154 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
156 result
= _simd_and_ps(result
, this->clipCodes
[i
]);
161 simdscalar
ComputeClipCodeUnion()
163 simdscalar result
= this->clipCodes
[0];
164 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
166 result
= _simd_or_ps(result
, this->clipCodes
[i
]);
171 int ComputeNegWMask()
173 simdscalar clipCodeUnion
= ComputeClipCodeUnion();
174 clipCodeUnion
= _simd_and_ps(clipCodeUnion
, _simd_castsi_ps(_simd_set1_epi32(NEGW
)));
175 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion
, _simd_setzero_ps()));
178 int ComputeClipMask()
180 simdscalar clipUnion
= ComputeClipCodeUnion();
181 clipUnion
= _simd_and_ps(clipUnion
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK
)));
182 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion
, _simd_setzero_ps()));
185 // clipper is responsible for culling any prims with NAN coordinates
186 int ComputeNaNMask(simdvector prim
[])
188 simdscalar vNanMask
= _simd_setzero_ps();
189 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
191 simdscalar vNan01
= _simd_cmp_ps(prim
[e
].v
[0], prim
[e
].v
[1], _CMP_UNORD_Q
);
192 vNanMask
= _simd_or_ps(vNanMask
, vNan01
);
193 simdscalar vNan23
= _simd_cmp_ps(prim
[e
].v
[2], prim
[e
].v
[3], _CMP_UNORD_Q
);
194 vNanMask
= _simd_or_ps(vNanMask
, vNan23
);
197 return _simd_movemask_ps(vNanMask
);
200 int ComputeUserClipCullMask(PA_STATE
& pa
, simdvector prim
[])
202 uint8_t cullMask
= this->state
.rastState
.cullDistanceMask
;
203 simdscalar vClipCullMask
= _simd_setzero_ps();
206 simdvector vClipCullDistLo
[3];
207 simdvector vClipCullDistHi
[3];
209 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, vClipCullDistLo
);
210 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, vClipCullDistHi
);
211 while (_BitScanForward(&index
, cullMask
))
213 cullMask
&= ~(1 << index
);
214 uint32_t slot
= index
>> 2;
215 uint32_t component
= index
& 0x3;
217 simdscalar vCullMaskElem
= _simd_set1_ps(-1.0f
);
218 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
220 simdscalar vCullComp
;
223 vCullComp
= vClipCullDistLo
[e
][component
];
227 vCullComp
= vClipCullDistHi
[e
][component
];
230 // cull if cull distance < 0 || NAN
231 simdscalar vCull
= _simd_cmp_ps(_mm256_setzero_ps(), vCullComp
, _CMP_NLE_UQ
);
232 vCullMaskElem
= _simd_and_ps(vCullMaskElem
, vCull
);
234 vClipCullMask
= _simd_or_ps(vClipCullMask
, vCullMaskElem
);
237 // clipper should also discard any primitive with NAN clip distance
238 uint8_t clipMask
= this->state
.rastState
.clipDistanceMask
;
239 while (_BitScanForward(&index
, clipMask
))
241 clipMask
&= ~(1 << index
);
242 uint32_t slot
= index
>> 2;
243 uint32_t component
= index
& 0x3;
245 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
247 simdscalar vClipComp
;
250 vClipComp
= vClipCullDistLo
[e
][component
];
254 vClipComp
= vClipCullDistHi
[e
][component
];
257 simdscalar vClip
= _simd_cmp_ps(vClipComp
, vClipComp
, _CMP_UNORD_Q
);
258 vClipCullMask
= _simd_or_ps(vClipCullMask
, vClip
);
262 return _simd_movemask_ps(vClipCullMask
);
265 // clip a single primitive
266 int ClipScalar(PA_STATE
& pa
, uint32_t primIndex
, float* pOutPos
, float* pOutAttribs
)
268 OSALIGNSIMD(float) inVerts
[3 * 4];
269 OSALIGNSIMD(float) inAttribs
[3 * KNOB_NUM_ATTRIBUTES
* 4];
271 // transpose primitive position
273 pa
.AssembleSingle(VERTEX_POSITION_SLOT
, primIndex
, verts
);
274 _mm_store_ps(&inVerts
[0], verts
[0]);
275 _mm_store_ps(&inVerts
[4], verts
[1]);
276 _mm_store_ps(&inVerts
[8], verts
[2]);
279 uint32_t numScalarAttribs
= this->state
.linkageCount
* 4;
284 uint32_t tmpLinkage
= uint32_t(this->state
.linkageMask
);
285 while (_BitScanForward(&slot
, tmpLinkage
))
287 tmpLinkage
&= ~(1 << slot
);
288 // Compute absolute attrib slot in vertex array
289 uint32_t inputSlot
= VERTEX_ATTRIB_START_SLOT
+ this->state
.linkageMap
[mapIdx
++];
290 __m128 attrib
[3]; // triangle attribs (always 4 wide)
291 pa
.AssembleSingle(inputSlot
, primIndex
, attrib
);
292 _mm_store_ps(&inAttribs
[idx
], attrib
[0]);
293 _mm_store_ps(&inAttribs
[idx
+ numScalarAttribs
], attrib
[1]);
294 _mm_store_ps(&inAttribs
[idx
+ numScalarAttribs
* 2], attrib
[2]);
299 Clip(inVerts
, inAttribs
, numScalarAttribs
, pOutPos
, &numVerts
, pOutAttribs
);
304 // clip SIMD primitives
305 void ClipSimd(const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, PA_STATE
& pa
, const simdscalari
& vPrimId
, const simdscalari
& vViewportIdx
)
307 // input/output vertex store for clipper
308 simdvertex vertices
[7]; // maximum 7 verts generated per triangle
310 LONG constantInterpMask
= this->state
.backendState
.constantInterpolationMask
;
311 uint32_t provokingVertex
= 0;
312 if(pa
.binTopology
== TOP_TRIANGLE_FAN
)
314 provokingVertex
= this->state
.frontendState
.provokingVertex
.triFan
;
316 ///@todo: line topology for wireframe?
319 simdvector tmpVector
[NumVertsPerPrim
];
320 pa
.Assemble(VERTEX_POSITION_SLOT
, tmpVector
);
321 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
323 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = tmpVector
[i
];
327 const SWR_BACKEND_STATE
& backendState
= this->state
.backendState
;
329 int32_t maxSlot
= -1;
330 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
332 // Compute absolute attrib slot in vertex array
333 uint32_t mapSlot
= backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
334 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
335 uint32_t inputSlot
= VERTEX_ATTRIB_START_SLOT
+ mapSlot
;
337 pa
.Assemble(inputSlot
, tmpVector
);
339 // if constant interpolation enabled for this attribute, assign the provoking
340 // vertex values to all edges
341 if (_bittest(&constantInterpMask
, slot
))
343 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
345 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
350 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
352 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
357 // assemble user clip distances if enabled
358 if (this->state
.rastState
.clipDistanceMask
& 0xf)
360 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, tmpVector
);
361 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
363 vertices
[i
].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
] = tmpVector
[i
];
367 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
369 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, tmpVector
);
370 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
372 vertices
[i
].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
] = tmpVector
[i
];
376 uint32_t numAttribs
= maxSlot
+ 1;
378 simdscalari vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
380 // set up new PA for binning clipped primitives
381 PFN_PROCESS_PRIMS pfnBinFunc
= nullptr;
382 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
383 if (NumVertsPerPrim
== 3)
385 pfnBinFunc
= GetBinTrianglesFunc((pa
.pDC
->pState
->state
.rastState
.conservativeRast
> 0));
386 clipTopology
= TOP_TRIANGLE_FAN
;
388 // so that the binner knows to bloat wide points later
389 if (pa
.binTopology
== TOP_POINT_LIST
)
390 clipTopology
= TOP_POINT_LIST
;
392 else if (NumVertsPerPrim
== 2)
394 pfnBinFunc
= BinLines
;
395 clipTopology
= TOP_LINE_LIST
;
399 SWR_ASSERT(0 && "Unexpected points in clipper.");
403 uint32_t* pVertexCount
= (uint32_t*)&vNumClippedVerts
;
404 uint32_t* pPrimitiveId
= (uint32_t*)&vPrimId
;
405 uint32_t* pViewportIdx
= (uint32_t*)&vViewportIdx
;
407 const simdscalari vOffsets
= _mm256_set_epi32(
408 0 * sizeof(simdvertex
), // unused lane
409 6 * sizeof(simdvertex
),
410 5 * sizeof(simdvertex
),
411 4 * sizeof(simdvertex
),
412 3 * sizeof(simdvertex
),
413 2 * sizeof(simdvertex
),
414 1 * sizeof(simdvertex
),
415 0 * sizeof(simdvertex
));
417 // only need to gather 7 verts
418 // @todo dynamic mask based on actual # of verts generated per lane
419 const simdscalar vMask
= _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
421 uint32_t numClippedPrims
= 0;
422 for (uint32_t inputPrim
= 0; inputPrim
< pa
.NumPrims(); ++inputPrim
)
424 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
425 if (numEmittedVerts
< NumVertsPerPrim
)
429 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
431 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
432 numClippedPrims
+= numEmittedPrims
;
434 // tranpose clipper output so that each lane's vertices are in SIMD order
435 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
437 simdvertex transposedPrims
[2];
440 uint8_t* pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
441 for (uint32_t c
= 0; c
< 4; ++c
)
443 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
444 pBase
+= sizeof(simdscalar
);
448 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_ATTRIB_START_SLOT
]) + sizeof(float) * inputPrim
;
449 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
451 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ attrib
;
452 for (uint32_t c
= 0; c
< 4; ++c
)
454 transposedPrims
[0].attrib
[attribSlot
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
455 pBase
+= sizeof(simdscalar
);
459 // transpose user clip distances if enabled
460 if (this->state
.rastState
.clipDistanceMask
& 0xf)
462 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
]) + sizeof(float) * inputPrim
;
463 for (uint32_t c
= 0; c
< 4; ++c
)
465 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
466 pBase
+= sizeof(simdscalar
);
470 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
472 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
]) + sizeof(float) * inputPrim
;
473 for (uint32_t c
= 0; c
< 4; ++c
)
475 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
476 pBase
+= sizeof(simdscalar
);
480 PA_STATE_OPT
clipPa(this->pDC
, numEmittedPrims
, (uint8_t*)&transposedPrims
[0], numEmittedVerts
, true, clipTopology
);
482 while (clipPa
.GetNextStreamOutput())
486 simdvector attrib
[NumVertsPerPrim
];
487 bool assemble
= clipPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
490 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
491 pfnBinFunc(this->pDC
, clipPa
, this->workerId
, attrib
, primMaskMap
[numEmittedPrims
], _simd_set1_epi32(pPrimitiveId
[inputPrim
]), _simd_set1_epi32(pViewportIdx
[inputPrim
]));
493 } while (clipPa
.NextPrim());
497 // update global pipeline stat
498 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
// Clipper stage entry point for one SIMD batch of primitives.
//
// Visible steps, in order:
//  - picks a binner function from pa.binTopology (BinPoints, BinLines, or the
//    function returned by GetBinTrianglesFunc);
//  - adds the population count of primMask to the CInvocations stat;
//  - computes per-vertex clip codes for the batch;
//  - removes primitives with NaN coordinates from primMask and, when
//    rastState.cullDistanceMask is non-zero, those rejected by
//    ComputeUserClipCullMask;
//  - validMask keeps the remaining primitives whose clip-code intersection is
//    zero; clipMask (left at 0 when NumVertsPerPrim == 1) keeps those that
//    ComputeClipMask flags;
//  - either runs ClipSimd, which also bins its output, or updates CPrimitives
//    and forwards the valid primitives straight to the binner.
//
// NOTE(review): several lines of this function are not visible in this view --
// some case labels/breaks of the switch and the conditions guarding the
// ClipSimd and direct-binning branches. Presumably the branches test clipMask
// and validMask respectively; confirm against the complete file.
// NOTE(review): pContext is not referenced by name in the visible code; it may
// be consumed by the AR_BEGIN/AR_END or UPDATE_STAT_FE macros -- verify before
// removing it.
501 // execute the clipper stage
502 void ExecuteStage(PA_STATE
& pa
, simdvector prim
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
)
504 SWR_ASSERT(pa
.pDC
!= nullptr);
505 SWR_CONTEXT
* pContext
= pa
.pDC
->pContext
;
507 // set up binner based on PA state
508 PFN_PROCESS_PRIMS pfnBinner
;
509 switch (pa
.binTopology
)
512 pfnBinner
= BinPoints
;
517 case TOP_LINE_LIST_ADJ
:
518 case TOP_LISTSTRIP_ADJ
:
519 pfnBinner
= BinLines
;
522 pfnBinner
= GetBinTrianglesFunc((pa
.pDC
->pState
->state
.rastState
.conservativeRast
> 0));
527 // update clipper invocations pipeline stat
528 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
529 UPDATE_STAT_FE(CInvocations
, numInvoc
);
531 ComputeClipCodes(prim
, viewportIdx
);
533 // cull prims with NAN coords
534 primMask
&= ~ComputeNaNMask(prim
);
536 // user cull distance cull
537 if (this->state
.rastState
.cullDistanceMask
)
539 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
542 // cull prims outside view frustum
543 simdscalar clipIntersection
= ComputeClipCodeIntersection();
544 int validMask
= primMask
& _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection
, _simd_setzero_ps()));
546 // skip clipping for points
547 uint32_t clipMask
= 0;
548 if (NumVertsPerPrim
!= 1)
550 clipMask
= primMask
& ComputeClipMask();
555 AR_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
556 // we have to clip tris, execute the clipper, which will also
558 ClipSimd(vMask(primMask
), vMask(clipMask
), pa
, primId
, viewportIdx
);
559 AR_END(FEGuardbandClip
, 1);
563 // update CPrimitives pipeline state
564 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
566 // forward valid prims directly to binner
567 pfnBinner(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
, viewportIdx
);
572 inline simdscalar
ComputeInterpFactor(simdscalar boundaryCoord0
, simdscalar boundaryCoord1
)
574 return _simd_div_ps(boundaryCoord0
, _simd_sub_ps(boundaryCoord0
, boundaryCoord1
));
577 inline simdscalari
ComputeOffsets(uint32_t attrib
, simdscalari vIndices
, uint32_t component
)
579 const uint32_t simdVertexStride
= sizeof(simdvertex
);
580 const uint32_t componentStride
= sizeof(simdscalar
);
581 const uint32_t attribStride
= sizeof(simdvector
);
582 const __m256i vElemOffset
= _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
583 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
585 // step to the simdvertex
586 simdscalari vOffsets
= _simd_mullo_epi32(vIndices
, _simd_set1_epi32(simdVertexStride
));
588 // step to the attribute and component
589 vOffsets
= _simd_add_epi32(vOffsets
, _simd_set1_epi32(attribStride
* attrib
+ componentStride
* component
));
592 vOffsets
= _simd_add_epi32(vOffsets
, vElemOffset
);
597 // gathers a single component for a given attribute for each SIMD lane
598 inline simdscalar
GatherComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
)
600 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
601 simdscalar vSrc
= _mm256_undefined_ps();
602 return _simd_mask_i32gather_ps(vSrc
, pBuffer
, vOffsets
, vMask
, 1);
605 inline void ScatterComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
, simdscalar vSrc
)
607 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
609 uint32_t* pOffsets
= (uint32_t*)&vOffsets
;
610 float* pSrc
= (float*)&vSrc
;
611 uint32_t mask
= _simd_movemask_ps(vMask
);
613 while (_BitScanForward(&lane
, mask
))
615 mask
&= ~(1 << lane
);
616 uint8_t* pBuf
= (uint8_t*)pBuffer
+ pOffsets
[lane
];
617 *(float*)pBuf
= pSrc
[lane
];
621 template<SWR_CLIPCODES ClippingPlane
>
622 inline void intersect(
623 const simdscalar
& vActiveMask
, // active lanes to operate on
624 const simdscalari
& s
, // index to first edge vertex v0 in pInPts.
625 const simdscalari
& p
, // index to second edge vertex v1 in pInPts.
626 const simdvector
& v1
, // vertex 0 position
627 const simdvector
& v2
, // vertex 1 position
628 simdscalari
& outIndex
, // output index.
629 const float *pInVerts
, // array of all the input positions.
630 uint32_t numInAttribs
, // number of attributes per vertex.
631 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
633 // compute interpolation factor
635 switch (ClippingPlane
)
637 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[0]), _simd_add_ps(v2
[3], v2
[0])); break;
638 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[0]), _simd_sub_ps(v2
[3], v2
[0])); break;
639 case FRUSTUM_TOP
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[1]), _simd_add_ps(v2
[3], v2
[1])); break;
640 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[1]), _simd_sub_ps(v2
[3], v2
[1])); break;
642 // DX Znear plane is 0, GL is -w
643 if (this->state
.rastState
.clipHalfZ
)
645 t
= ComputeInterpFactor(v1
[2], v2
[2]);
649 t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[2]), _simd_add_ps(v2
[3], v2
[2]));
652 case FRUSTUM_FAR
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[2]), _simd_sub_ps(v2
[3], v2
[2])); break;
653 default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane
);
656 // interpolate position and store
657 for (uint32_t c
= 0; c
< 4; ++c
)
659 simdscalar vOutPos
= _simd_fmadd_ps(_simd_sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
660 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
663 // interpolate attributes and store
664 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
666 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
667 for (uint32_t c
= 0; c
< 4; ++c
)
669 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
670 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
671 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
672 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
676 // interpolate clip distance if enabled
677 if (this->state
.rastState
.clipDistanceMask
& 0xf)
679 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_LO_SLOT
;
680 for (uint32_t c
= 0; c
< 4; ++c
)
682 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
683 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
684 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
685 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
689 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
691 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_HI_SLOT
;
692 for (uint32_t c
= 0; c
< 4; ++c
)
694 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
695 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
696 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
697 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
702 template<SWR_CLIPCODES ClippingPlane
>
703 inline simdscalar
inside(const simdvector
& v
)
705 switch (ClippingPlane
)
707 case FRUSTUM_LEFT
: return _simd_cmpge_ps(v
[0], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
708 case FRUSTUM_RIGHT
: return _simd_cmple_ps(v
[0], v
[3]);
709 case FRUSTUM_TOP
: return _simd_cmpge_ps(v
[1], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
710 case FRUSTUM_BOTTOM
: return _simd_cmple_ps(v
[1], v
[3]);
711 case FRUSTUM_NEAR
: return _simd_cmpge_ps(v
[2], this->state
.rastState
.clipHalfZ
? _simd_setzero_ps() : _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
712 case FRUSTUM_FAR
: return _simd_cmple_ps(v
[2], v
[3]);
714 SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane
);
715 return _simd_setzero_ps();
719 template<SWR_CLIPCODES ClippingPlane
>
720 simdscalari
ClipTriToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
722 simdscalari vCurIndex
= _simd_setzero_si();
723 simdscalari vOutIndex
= _simd_setzero_si();
724 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
726 while (!_simd_testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
728 simdscalari s
= vCurIndex
;
729 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
730 simdscalari underFlowMask
= _simd_cmpgt_epi32(vNumInPts
, p
);
731 p
= _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p
), _simd_castsi_ps(underFlowMask
)));
734 simdvector vInPos0
, vInPos1
;
735 for (uint32_t c
= 0; c
< 4; ++c
)
737 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
738 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
741 // compute inside mask
742 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
743 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
745 // compute intersection mask (s_in != p_in)
746 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
747 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
750 s_in
= _simd_and_ps(s_in
, vActiveMask
);
751 if (!_simd_testz_ps(s_in
, s_in
))
754 for (uint32_t c
= 0; c
< 4; ++c
)
756 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
760 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
762 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
763 for (uint32_t c
= 0; c
< 4; ++c
)
765 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
766 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
770 // store clip distance if enabled
771 if (this->state
.rastState
.clipDistanceMask
& 0xf)
773 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_LO_SLOT
;
774 for (uint32_t c
= 0; c
< 4; ++c
)
776 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
777 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
781 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
783 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_HI_SLOT
;
784 for (uint32_t c
= 0; c
< 4; ++c
)
786 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
787 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
791 // increment outIndex
792 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
795 // compute and store intersection
796 if (!_simd_testz_ps(intersectMask
, intersectMask
))
798 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
800 // increment outIndex for active lanes
801 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
804 // increment loop index and update active mask
805 vCurIndex
= _simd_add_epi32(vCurIndex
, _simd_set1_epi32(1));
806 vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
812 template<SWR_CLIPCODES ClippingPlane
>
813 simdscalari
ClipLineToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
815 simdscalari vCurIndex
= _simd_setzero_si();
816 simdscalari vOutIndex
= _simd_setzero_si();
817 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
819 if (!_simd_testz_ps(vActiveMask
, vActiveMask
))
821 simdscalari s
= vCurIndex
;
822 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
825 simdvector vInPos0
, vInPos1
;
826 for (uint32_t c
= 0; c
< 4; ++c
)
828 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
829 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
832 // compute inside mask
833 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
834 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
836 // compute intersection mask (s_in != p_in)
837 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
838 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
841 s_in
= _simd_and_ps(s_in
, vActiveMask
);
842 if (!_simd_testz_ps(s_in
, s_in
))
844 for (uint32_t c
= 0; c
< 4; ++c
)
846 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
849 // interpolate attributes and store
850 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
852 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
853 for (uint32_t c
= 0; c
< 4; ++c
)
855 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
856 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
860 // increment outIndex
861 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
864 // compute and store intersection
865 if (!_simd_testz_ps(intersectMask
, intersectMask
))
867 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
869 // increment outIndex for active lanes
870 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
874 p_in
= _simd_and_ps(p_in
, vActiveMask
);
875 if (!_simd_testz_ps(p_in
, p_in
))
877 for (uint32_t c
= 0; c
< 4; ++c
)
879 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
882 // interpolate attributes and store
883 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
885 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
886 for (uint32_t c
= 0; c
< 4; ++c
)
888 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
889 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
893 // increment outIndex
894 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), p_in
);
901 //////////////////////////////////////////////////////////////////////////
902 /// @brief Vertical clipper. Clips SIMD primitives at a time
903 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
904 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
905 /// @param numAttribs - number of valid input attribs, including position
906 simdscalari
ClipPrims(float* pVertices
, const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, int numAttribs
)
909 float* pTempVerts
= (float*)&tlsTempVertices
[0];
911 // zero out num input verts for non-active lanes
912 simdscalari vNumInPts
= _simd_set1_epi32(NumVertsPerPrim
);
913 vNumInPts
= _simd_blendv_epi32(_simd_setzero_si(), vNumInPts
, vClipMask
);
915 // clip prims to frustum
916 simdscalari vNumOutPts
;
917 if (NumVertsPerPrim
== 3)
919 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
920 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
921 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
922 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
923 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
924 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
928 SWR_ASSERT(NumVertsPerPrim
== 2);
929 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
930 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
931 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
932 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
933 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
934 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
937 // restore num verts for non-clipped, active lanes
938 simdscalar vNonClippedMask
= _simd_andnot_ps(vClipMask
, vPrimMask
);
939 vNumOutPts
= _simd_blendv_epi32(vNumOutPts
, _simd_set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
944 const uint32_t workerId
{ 0 };
945 DRAW_CONTEXT
* pDC
{ nullptr };
946 const API_STATE
& state
;
947 simdscalar clipCodes
[NumVertsPerPrim
];
951 // pipeline stage functions
952 void ClipTriangles(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
);
953 void ClipLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
);
954 void ClipPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
);