/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Definitions for clipping
*
******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
// rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
// Per-plane clip code bits; stored in float lanes, so the interesting bits live
// above the mantissa (see CLIPCODE_SHIFT comment above).
// NOTE(review): the enum declaration itself was dropped by formatting damage and is restored here.
enum SWR_CLIPCODES
{
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    // w <= 0 (vertex behind the eye)
    NEGW             = (0x40 << CLIPCODE_SHIFT),

    // Guardband codes share one high bit and are distinguished by the low 4 bits.
    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// All six frustum planes.
#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
// Planes tested when guardband clipping: near/far plus the guardband edges and negative w.
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
// Scalar triangle clipper (implemented elsewhere): clips one triangle and its
// attributes against the frustum, writing clipped positions/attributes to the
// output arrays; *numVerts receives the emitted vertex count.
// NOTE(review): output buffers are presumably sized for up to 7 verts (see
// vertices[7] usage below) - confirm with the Clip() implementation.
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
          int *numVerts, float *pOutAttribs);
63 void ComputeClipCodes(DRIVER_TYPE type
, const API_STATE
& state
, const simdvector
& vertex
, simdscalar
& clipCodes
)
65 clipCodes
= _simd_setzero_ps();
68 simdscalar vNegW
= _simd_mul_ps(vertex
.w
, _simd_set1_ps(-1.0f
));
71 simdscalar vRes
= _simd_cmplt_ps(vertex
.x
, vNegW
);
72 clipCodes
= _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT
)));
75 vRes
= _simd_cmplt_ps(vertex
.y
, vNegW
);
76 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP
))));
79 vRes
= _simd_cmpgt_ps(vertex
.x
, vertex
.w
);
80 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT
))));
83 vRes
= _simd_cmpgt_ps(vertex
.y
, vertex
.w
);
84 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM
))));
86 if (state
.rastState
.depthClipEnable
)
89 // DX clips depth [0..w], GL clips [-w..w]
92 vRes
= _simd_cmplt_ps(vertex
.z
, _simd_setzero_ps());
96 vRes
= _simd_cmplt_ps(vertex
.z
, vNegW
);
98 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR
))));
101 vRes
= _simd_cmpgt_ps(vertex
.z
, vertex
.w
);
102 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR
))));
106 vRes
= _simd_cmple_ps(vertex
.w
, _simd_setzero_ps());
107 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(NEGW
))));
110 simdscalar gbMult
= _simd_mul_ps(vNegW
, _simd_set1_ps(state
.gbState
.left
));
111 vRes
= _simd_cmplt_ps(vertex
.x
, gbMult
);
112 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT
))));
115 gbMult
= _simd_mul_ps(vNegW
, _simd_set1_ps(state
.gbState
.top
));
116 vRes
= _simd_cmplt_ps(vertex
.y
, gbMult
);
117 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP
))));
120 gbMult
= _simd_mul_ps(vertex
.w
, _simd_set1_ps(state
.gbState
.right
));
121 vRes
= _simd_cmpgt_ps(vertex
.x
, gbMult
);
122 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT
))));
125 gbMult
= _simd_mul_ps(vertex
.w
, _simd_set1_ps(state
.gbState
.bottom
));
126 vRes
= _simd_cmpgt_ps(vertex
.y
, gbMult
);
127 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM
))));
130 template<uint32_t NumVertsPerPrim
>
134 Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
135 workerId(in_workerId
), driverType(in_pDC
->pContext
->driverType
), pDC(in_pDC
), state(GetApiState(in_pDC
))
137 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
140 void ComputeClipCodes(simdvector vertex
[])
142 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
144 ::ComputeClipCodes(this->driverType
, this->state
, vertex
[i
], this->clipCodes
[i
]);
148 simdscalar
ComputeClipCodeIntersection()
150 simdscalar result
= this->clipCodes
[0];
151 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
153 result
= _simd_and_ps(result
, this->clipCodes
[i
]);
158 simdscalar
ComputeClipCodeUnion()
160 simdscalar result
= this->clipCodes
[0];
161 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
163 result
= _simd_or_ps(result
, this->clipCodes
[i
]);
168 int ComputeNegWMask()
170 simdscalar clipCodeUnion
= ComputeClipCodeUnion();
171 clipCodeUnion
= _simd_and_ps(clipCodeUnion
, _simd_castsi_ps(_simd_set1_epi32(NEGW
)));
172 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion
, _simd_setzero_ps()));
175 int ComputeClipMask()
177 simdscalar clipUnion
= ComputeClipCodeUnion();
178 clipUnion
= _simd_and_ps(clipUnion
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK
)));
179 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion
, _simd_setzero_ps()));
182 // clipper is responsible for culling any prims with NAN coordinates
183 int ComputeNaNMask(simdvector prim
[])
185 simdscalar vNanMask
= _simd_setzero_ps();
186 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
188 simdscalar vNan01
= _simd_cmp_ps(prim
[e
].v
[0], prim
[e
].v
[1], _CMP_UNORD_Q
);
189 vNanMask
= _simd_or_ps(vNanMask
, vNan01
);
190 simdscalar vNan23
= _simd_cmp_ps(prim
[e
].v
[2], prim
[e
].v
[3], _CMP_UNORD_Q
);
191 vNanMask
= _simd_or_ps(vNanMask
, vNan23
);
194 return _simd_movemask_ps(vNanMask
);
197 int ComputeUserClipCullMask(PA_STATE
& pa
, simdvector prim
[])
199 uint8_t cullMask
= this->state
.rastState
.cullDistanceMask
;
200 simdscalar vClipCullMask
= _simd_setzero_ps();
203 simdvector vClipCullDistLo
[3];
204 simdvector vClipCullDistHi
[3];
206 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, vClipCullDistLo
);
207 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, vClipCullDistHi
);
208 while (_BitScanForward(&index
, cullMask
))
210 cullMask
&= ~(1 << index
);
211 uint32_t slot
= index
>> 2;
212 uint32_t component
= index
& 0x3;
214 simdscalar vCullMaskElem
= _simd_set1_ps(-1.0f
);
215 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
217 simdscalar vCullComp
;
220 vCullComp
= vClipCullDistLo
[e
][component
];
224 vCullComp
= vClipCullDistHi
[e
][component
];
227 // cull if cull distance < 0 || NAN
228 simdscalar vCull
= _simd_cmp_ps(_mm256_setzero_ps(), vCullComp
, _CMP_NLE_UQ
);
229 vCullMaskElem
= _simd_and_ps(vCullMaskElem
, vCull
);
231 vClipCullMask
= _simd_or_ps(vClipCullMask
, vCullMaskElem
);
234 // clipper should also discard any primitive with NAN clip distance
235 uint8_t clipMask
= this->state
.rastState
.clipDistanceMask
;
236 while (_BitScanForward(&index
, clipMask
))
238 clipMask
&= ~(1 << index
);
239 uint32_t slot
= index
>> 2;
240 uint32_t component
= index
& 0x3;
242 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
244 simdscalar vClipComp
;
247 vClipComp
= vClipCullDistLo
[e
][component
];
251 vClipComp
= vClipCullDistHi
[e
][component
];
254 simdscalar vClip
= _simd_cmp_ps(vClipComp
, vClipComp
, _CMP_UNORD_Q
);
255 vClipCullMask
= _simd_or_ps(vClipCullMask
, vClip
);
259 return _simd_movemask_ps(vClipCullMask
);
262 // clip a single primitive
263 int ClipScalar(PA_STATE
& pa
, uint32_t primIndex
, float* pOutPos
, float* pOutAttribs
)
265 OSALIGN(float, 16) inVerts
[3 * 4];
266 OSALIGN(float, 16) inAttribs
[3 * KNOB_NUM_ATTRIBUTES
* 4];
268 // transpose primitive position
270 pa
.AssembleSingle(VERTEX_POSITION_SLOT
, primIndex
, verts
);
271 _mm_store_ps(&inVerts
[0], verts
[0]);
272 _mm_store_ps(&inVerts
[4], verts
[1]);
273 _mm_store_ps(&inVerts
[8], verts
[2]);
276 uint32_t numScalarAttribs
= this->state
.linkageCount
* 4;
281 uint32_t tmpLinkage
= uint32_t(this->state
.linkageMask
);
282 while (_BitScanForward(&slot
, tmpLinkage
))
284 tmpLinkage
&= ~(1 << slot
);
285 // Compute absolute attrib slot in vertex array
286 uint32_t inputSlot
= VERTEX_ATTRIB_START_SLOT
+ this->state
.linkageMap
[mapIdx
++];
287 __m128 attrib
[3]; // triangle attribs (always 4 wide)
288 pa
.AssembleSingle(inputSlot
, primIndex
, attrib
);
289 _mm_store_ps(&inAttribs
[idx
], attrib
[0]);
290 _mm_store_ps(&inAttribs
[idx
+ numScalarAttribs
], attrib
[1]);
291 _mm_store_ps(&inAttribs
[idx
+ numScalarAttribs
* 2], attrib
[2]);
296 Clip(inVerts
, inAttribs
, numScalarAttribs
, pOutPos
, &numVerts
, pOutAttribs
);
301 // clip SIMD primitives
302 void ClipSimd(const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, PA_STATE
& pa
, const simdscalari
& vPrimId
)
304 // input/output vertex store for clipper
305 simdvertex vertices
[7]; // maximum 7 verts generated per triangle
307 LONG constantInterpMask
= this->state
.backendState
.constantInterpolationMask
;
308 uint32_t provokingVertex
= 0;
309 if(pa
.binTopology
== TOP_TRIANGLE_FAN
)
311 provokingVertex
= this->state
.frontendState
.provokingVertex
.triFan
;
313 ///@todo: line topology for wireframe?
316 simdvector tmpVector
[NumVertsPerPrim
];
317 pa
.Assemble(VERTEX_POSITION_SLOT
, tmpVector
);
318 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
320 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = tmpVector
[i
];
326 uint32_t tmpLinkage
= this->state
.linkageMask
;
328 int32_t maxSlot
= -1;
329 while (_BitScanForward(&slot
, tmpLinkage
))
331 tmpLinkage
&= ~(1 << slot
);
332 // Compute absolute attrib slot in vertex array
333 uint32_t mapSlot
= this->state
.linkageMap
[mapIdx
++];
334 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
335 uint32_t inputSlot
= VERTEX_ATTRIB_START_SLOT
+ mapSlot
;
337 pa
.Assemble(inputSlot
, tmpVector
);
339 // if constant interpolation enabled for this attribute, assign the provoking
340 // vertex values to all edges
341 if (_bittest(&constantInterpMask
, slot
))
343 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
345 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
350 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
352 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
357 uint32_t numAttribs
= maxSlot
+ 1;
359 simdscalari vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
361 // set up new PA for binning clipped primitives
362 PFN_PROCESS_PRIMS pfnBinFunc
= nullptr;
363 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
364 if (NumVertsPerPrim
== 3)
366 pfnBinFunc
= BinTriangles
;
367 clipTopology
= TOP_TRIANGLE_FAN
;
369 // so that the binner knows to bloat wide points later
370 if (pa
.binTopology
== TOP_POINT_LIST
)
371 clipTopology
= TOP_POINT_LIST
;
373 else if (NumVertsPerPrim
== 2)
375 pfnBinFunc
= BinLines
;
376 clipTopology
= TOP_LINE_LIST
;
380 SWR_ASSERT(0 && "Unexpected points in clipper.");
384 uint32_t* pVertexCount
= (uint32_t*)&vNumClippedVerts
;
385 uint32_t* pPrimitiveId
= (uint32_t*)&vPrimId
;
387 const simdscalari vOffsets
= _mm256_set_epi32(
388 0 * sizeof(simdvertex
), // unused lane
389 6 * sizeof(simdvertex
),
390 5 * sizeof(simdvertex
),
391 4 * sizeof(simdvertex
),
392 3 * sizeof(simdvertex
),
393 2 * sizeof(simdvertex
),
394 1 * sizeof(simdvertex
),
395 0 * sizeof(simdvertex
));
397 // only need to gather 7 verts
398 // @todo dynamic mask based on actual # of verts generated per lane
399 const simdscalar vMask
= _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
401 uint32_t numClippedPrims
= 0;
402 for (uint32_t inputPrim
= 0; inputPrim
< pa
.NumPrims(); ++inputPrim
)
404 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
405 if (numEmittedVerts
< NumVertsPerPrim
)
409 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
411 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
412 numClippedPrims
+= numEmittedPrims
;
414 // tranpose clipper output so that each lane's vertices are in SIMD order
415 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
417 simdvertex transposedPrims
[2];
420 uint8_t* pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
421 for (uint32_t c
= 0; c
< 4; ++c
)
423 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
424 pBase
+= sizeof(simdscalar
);
428 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_ATTRIB_START_SLOT
]) + sizeof(float) * inputPrim
;
429 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
431 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ attrib
;
432 for (uint32_t c
= 0; c
< 4; ++c
)
434 transposedPrims
[0].attrib
[attribSlot
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
435 pBase
+= sizeof(simdscalar
);
439 PA_STATE_OPT
clipPa(this->pDC
, numEmittedPrims
, (uint8_t*)&transposedPrims
[0], numEmittedVerts
, true, clipTopology
);
441 while (clipPa
.GetNextStreamOutput())
445 simdvector attrib
[NumVertsPerPrim
];
446 bool assemble
= clipPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
449 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
450 pfnBinFunc(this->pDC
, clipPa
, this->workerId
, attrib
, primMaskMap
[numEmittedPrims
], _simd_set1_epi32(pPrimitiveId
[inputPrim
]));
452 } while (clipPa
.NextPrim());
456 // update global pipeline stat
457 SWR_CONTEXT
* pContext
= this->pDC
->pContext
;
458 UPDATE_STAT(CPrimitives
, numClippedPrims
);
461 // execute the clipper stage
462 void ExecuteStage(PA_STATE
& pa
, simdvector prim
[], uint32_t primMask
, simdscalari primId
)
464 // set up binner based on PA state
465 PFN_PROCESS_PRIMS pfnBinner
;
466 switch (pa
.binTopology
)
469 pfnBinner
= BinPoints
;
474 case TOP_LINE_LIST_ADJ
:
475 case TOP_LISTSTRIP_ADJ
:
476 pfnBinner
= BinLines
;
479 pfnBinner
= BinTriangles
;
483 // update clipper invocations pipeline stat
484 SWR_CONTEXT
* pContext
= this->pDC
->pContext
;
485 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
486 UPDATE_STAT(CInvocations
, numInvoc
);
488 ComputeClipCodes(prim
);
490 // cull prims with NAN coords
491 primMask
&= ~ComputeNaNMask(prim
);
493 // user cull distance cull
494 if (this->state
.rastState
.cullDistanceMask
)
496 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
499 // cull prims outside view frustum
500 simdscalar clipIntersection
= ComputeClipCodeIntersection();
501 int validMask
= primMask
& _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection
, _simd_setzero_ps()));
503 // skip clipping for points
504 uint32_t clipMask
= 0;
505 if (NumVertsPerPrim
!= 1)
507 clipMask
= primMask
& ComputeClipMask();
512 RDTSC_START(FEGuardbandClip
);
513 // we have to clip tris, execute the clipper, which will also
515 ClipSimd(vMask(primMask
), vMask(clipMask
), pa
, primId
);
516 RDTSC_STOP(FEGuardbandClip
, 1, 0);
520 // update CPrimitives pipeline state
521 SWR_CONTEXT
* pContext
= this->pDC
->pContext
;
522 UPDATE_STAT(CPrimitives
, _mm_popcnt_u32(validMask
));
524 // forward valid prims directly to binner
525 pfnBinner(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
);
530 inline simdscalar
ComputeInterpFactor(simdscalar boundaryCoord0
, simdscalar boundaryCoord1
)
532 return _simd_div_ps(boundaryCoord0
, _simd_sub_ps(boundaryCoord0
, boundaryCoord1
));
535 inline simdscalari
ComputeOffsets(uint32_t attrib
, simdscalari vIndices
, uint32_t component
)
537 const uint32_t simdVertexStride
= sizeof(simdvertex
);
538 const uint32_t componentStride
= sizeof(simdscalar
);
539 const uint32_t attribStride
= sizeof(simdvector
);
540 const __m256i vElemOffset
= _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
541 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
543 // step to the simdvertex
544 simdscalari vOffsets
= _simd_mullo_epi32(vIndices
, _simd_set1_epi32(simdVertexStride
));
546 // step to the attribute and component
547 vOffsets
= _simd_add_epi32(vOffsets
, _simd_set1_epi32(attribStride
* attrib
+ componentStride
* component
));
550 vOffsets
= _simd_add_epi32(vOffsets
, vElemOffset
);
555 // gathers a single component for a given attribute for each SIMD lane
556 inline simdscalar
GatherComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
)
558 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
559 simdscalar vSrc
= _mm256_undefined_ps();
560 return _simd_mask_i32gather_ps(vSrc
, pBuffer
, vOffsets
, vMask
, 1);
563 inline void ScatterComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
, simdscalar vSrc
)
565 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
567 uint32_t* pOffsets
= (uint32_t*)&vOffsets
;
568 float* pSrc
= (float*)&vSrc
;
569 uint32_t mask
= _simd_movemask_ps(vMask
);
571 while (_BitScanForward(&lane
, mask
))
573 mask
&= ~(1 << lane
);
574 uint8_t* pBuf
= (uint8_t*)pBuffer
+ pOffsets
[lane
];
575 *(float*)pBuf
= pSrc
[lane
];
579 template<SWR_CLIPCODES ClippingPlane
>
580 inline void intersect(
581 const simdscalar
& vActiveMask
, // active lanes to operate on
582 const simdscalari
& s
, // index to first edge vertex v0 in pInPts.
583 const simdscalari
& p
, // index to second edge vertex v1 in pInPts.
584 const simdvector
& v1
, // vertex 0 position
585 const simdvector
& v2
, // vertex 1 position
586 simdscalari
& outIndex
, // output index.
587 const float *pInVerts
, // array of all the input positions.
588 uint32_t numInAttribs
, // number of attributes per vertex.
589 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
591 // compute interpolation factor
593 switch (ClippingPlane
)
595 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[0]), _simd_add_ps(v2
[3], v2
[0])); break;
596 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[0]), _simd_sub_ps(v2
[3], v2
[0])); break;
597 case FRUSTUM_TOP
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[1]), _simd_add_ps(v2
[3], v2
[1])); break;
598 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[1]), _simd_sub_ps(v2
[3], v2
[1])); break;
600 // DX Znear plane is 0, GL is -w
601 if (this->driverType
== DX
)
603 t
= ComputeInterpFactor(v1
[2], v2
[2]);
607 t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[2]), _simd_add_ps(v2
[3], v2
[2]));
610 case FRUSTUM_FAR
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[2]), _simd_sub_ps(v2
[3], v2
[2])); break;
611 default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane
);
614 // interpolate position and store
615 for (uint32_t c
= 0; c
< 4; ++c
)
617 simdscalar vOutPos
= _simd_fmadd_ps(_simd_sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
618 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
621 // interpolate attributes and store
622 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
624 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
625 for (uint32_t c
= 0; c
< 4; ++c
)
627 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
628 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
629 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
630 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
635 template<SWR_CLIPCODES ClippingPlane
>
636 inline simdscalar
inside(const simdvector
& v
)
638 switch (ClippingPlane
)
640 case FRUSTUM_LEFT
: return _simd_cmpge_ps(v
[0], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
641 case FRUSTUM_RIGHT
: return _simd_cmple_ps(v
[0], v
[3]);
642 case FRUSTUM_TOP
: return _simd_cmpge_ps(v
[1], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
643 case FRUSTUM_BOTTOM
: return _simd_cmple_ps(v
[1], v
[3]);
644 case FRUSTUM_NEAR
: return _simd_cmpge_ps(v
[2], this->driverType
== DX
? _simd_setzero_ps() : _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
645 case FRUSTUM_FAR
: return _simd_cmple_ps(v
[2], v
[3]);
647 SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane
);
648 return _simd_setzero_ps();
652 template<SWR_CLIPCODES ClippingPlane
>
653 simdscalari
ClipTriToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
655 simdscalari vCurIndex
= _simd_setzero_si();
656 simdscalari vOutIndex
= _simd_setzero_si();
657 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
659 while (!_simd_testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
661 simdscalari s
= vCurIndex
;
662 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
663 simdscalari underFlowMask
= _simd_cmpgt_epi32(vNumInPts
, p
);
664 p
= _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p
), _simd_castsi_ps(underFlowMask
)));
667 simdvector vInPos0
, vInPos1
;
668 for (uint32_t c
= 0; c
< 4; ++c
)
670 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
671 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
674 // compute inside mask
675 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
676 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
678 // compute intersection mask (s_in != p_in)
679 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
680 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
683 s_in
= _simd_and_ps(s_in
, vActiveMask
);
684 if (!_simd_testz_ps(s_in
, s_in
))
687 for (uint32_t c
= 0; c
< 4; ++c
)
689 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
693 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
695 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
696 for (uint32_t c
= 0; c
< 4; ++c
)
698 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
699 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
703 // increment outIndex
704 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
707 // compute and store intersection
708 if (!_simd_testz_ps(intersectMask
, intersectMask
))
710 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
712 // increment outIndex for active lanes
713 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
716 // increment loop index and update active mask
717 vCurIndex
= _simd_add_epi32(vCurIndex
, _simd_set1_epi32(1));
718 vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
724 template<SWR_CLIPCODES ClippingPlane
>
725 simdscalari
ClipLineToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
727 simdscalari vCurIndex
= _simd_setzero_si();
728 simdscalari vOutIndex
= _simd_setzero_si();
729 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
731 if (!_simd_testz_ps(vActiveMask
, vActiveMask
))
733 simdscalari s
= vCurIndex
;
734 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
737 simdvector vInPos0
, vInPos1
;
738 for (uint32_t c
= 0; c
< 4; ++c
)
740 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
741 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
744 // compute inside mask
745 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
746 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
748 // compute intersection mask (s_in != p_in)
749 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
750 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
753 s_in
= _simd_and_ps(s_in
, vActiveMask
);
754 if (!_simd_testz_ps(s_in
, s_in
))
756 for (uint32_t c
= 0; c
< 4; ++c
)
758 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
761 // interpolate attributes and store
762 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
764 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
765 for (uint32_t c
= 0; c
< 4; ++c
)
767 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
768 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
772 // increment outIndex
773 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
776 // compute and store intersection
777 if (!_simd_testz_ps(intersectMask
, intersectMask
))
779 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
781 // increment outIndex for active lanes
782 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
786 p_in
= _simd_and_ps(p_in
, vActiveMask
);
787 if (!_simd_testz_ps(p_in
, p_in
))
789 for (uint32_t c
= 0; c
< 4; ++c
)
791 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
794 // interpolate attributes and store
795 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
797 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
798 for (uint32_t c
= 0; c
< 4; ++c
)
800 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
801 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
805 // increment outIndex
806 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), p_in
);
813 //////////////////////////////////////////////////////////////////////////
814 /// @brief Vertical clipper. Clips SIMD primitives at a time
815 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
816 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
817 /// @param numAttribs - number of valid input attribs, including position
818 simdscalari
ClipPrims(float* pVertices
, const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, int numAttribs
)
821 simdvertex tempVertices
[7];
822 float* pTempVerts
= (float*)&tempVertices
[0];
824 // zero out num input verts for non-active lanes
825 simdscalari vNumInPts
= _simd_set1_epi32(NumVertsPerPrim
);
826 vNumInPts
= _simd_blendv_epi32(_simd_setzero_si(), vNumInPts
, vClipMask
);
828 // clip prims to frustum
829 simdscalari vNumOutPts
;
830 if (NumVertsPerPrim
== 3)
832 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
833 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
834 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
835 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
836 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
837 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
841 SWR_ASSERT(NumVertsPerPrim
== 2);
842 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
843 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
844 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
845 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
846 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
847 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
850 // restore num verts for non-clipped, active lanes
851 simdscalar vNonClippedMask
= _simd_andnot_ps(vClipMask
, vPrimMask
);
852 vNumOutPts
= _simd_blendv_epi32(vNumOutPts
, _simd_set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
857 const uint32_t workerId
{ 0 };
858 const DRIVER_TYPE driverType
{ DX
};
859 DRAW_CONTEXT
* pDC
{ nullptr };
860 const API_STATE
& state
;
861 simdscalar clipCodes
[NumVertsPerPrim
];
// pipeline stage functions
// Front-end clipper stage entry points, one per primitive class. Each receives
// the SIMD primitives in prims[] (lane validity given by primMask) with their
// per-lane ids; presumably thin wrappers over Clipper<N>::ExecuteStage - confirm
// against the implementation file.
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);