1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for clipping
27 ******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
35 // Temp storage used by the clipper
36 extern THREAD SIMDVERTEX_T
<SIMD256
> tlsTempVertices
[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T
<SIMD512
> tlsTempVertices_simd16
[7];
43 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
44 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
45 #define CLIPCODE_SHIFT 23
// Frustum plane codes and NEGW: one distinct bit each, shifted left by CLIPCODE_SHIFT.
46 FRUSTUM_LEFT
= (0x01 << CLIPCODE_SHIFT
),
47 FRUSTUM_TOP
= (0x02 << CLIPCODE_SHIFT
),
48 FRUSTUM_RIGHT
= (0x04 << CLIPCODE_SHIFT
),
49 FRUSTUM_BOTTOM
= (0x08 << CLIPCODE_SHIFT
),
51 FRUSTUM_NEAR
= (0x10 << CLIPCODE_SHIFT
),
52 FRUSTUM_FAR
= (0x20 << CLIPCODE_SHIFT
),
54 NEGW
= (0x40 << CLIPCODE_SHIFT
),
// Guardband codes: all four share the bit (0x80 << CLIPCODE_SHIFT) and are told apart by bits 0..3.
56 GUARDBAND_LEFT
= (0x80 << CLIPCODE_SHIFT
| 0x1),
57 GUARDBAND_TOP
= (0x80 << CLIPCODE_SHIFT
| 0x2),
58 GUARDBAND_RIGHT
= (0x80 << CLIPCODE_SHIFT
| 0x4),
59 GUARDBAND_BOTTOM
= (0x80 << CLIPCODE_SHIFT
| 0x8)
// Union of the near/far codes, the four guardband codes and NEGW.
62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
// Builds the per-lane clip code bitmask for one SIMD vertex and stores it in clipCodes.
// Each test compares a vertex component against w, -w, zero, or a guardband-scaled w and
// ORs the matching FRUSTUM_* / NEGW / GUARDBAND_* bit into the lanes where the compare holds.
//   state           - supplies rastState.depthClipEnable, rastState.clipHalfZ and the gbState tables
//   vertex          - position to classify (x, y, z, w)
//   clipCodes       - output; overwritten, not accumulated
//   viewportIndexes - per-lane index used to gather the guardband multipliers from state.gbState
64 template<typename SIMD_T
>
65 void ComputeClipCodes(const API_STATE
&state
, const typename
SIMD_T::Vec4
&vertex
, typename
SIMD_T::Float
&clipCodes
, typename
SIMD_T::Integer
const &viewportIndexes
)
67 clipCodes
= SIMD_T::setzero_ps();
// vNegW = -w, reused by the left/top/near and guardband left/top tests
70 typename
SIMD_T::Float vNegW
= SIMD_T::mul_ps(vertex
.w
,SIMD_T::set1_ps(-1.0f
));
// FRUSTUM_LEFT: x < -w
73 typename
SIMD_T::Float vRes
= SIMD_T::cmplt_ps(vertex
.x
, vNegW
);
74 clipCodes
= SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT
)));
// FRUSTUM_TOP: y < -w
77 vRes
= SIMD_T::cmplt_ps(vertex
.y
, vNegW
);
78 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP
))));
// FRUSTUM_RIGHT: x > w
81 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, vertex
.w
);
82 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT
))));
// FRUSTUM_BOTTOM: y > w
85 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, vertex
.w
);
86 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM
))));
88 if (state
.rastState
.depthClipEnable
)
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state
.rastState
.clipHalfZ
)
// FRUSTUM_NEAR, half-Z form: z < 0
94 vRes
= SIMD_T::cmplt_ps(vertex
.z
, SIMD_T::setzero_ps());
// FRUSTUM_NEAR, full-Z form: z < -w
98 vRes
= SIMD_T::cmplt_ps(vertex
.z
, vNegW
);
100 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR
))));
// FRUSTUM_FAR: z > w
103 vRes
= SIMD_T::cmpgt_ps(vertex
.z
, vertex
.w
);
104 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR
))));
// NEGW: w <= 0
108 vRes
= SIMD_T::cmple_ps(vertex
.w
, SIMD_T::setzero_ps());
109 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW
))));
// GUARDBAND_LEFT: x < -w * gbState.left[viewport]
112 typename
SIMD_T::Float gbMult
= SIMD_T::mul_ps(vNegW
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.left
[0], viewportIndexes
));
113 vRes
= SIMD_T::cmplt_ps(vertex
.x
, gbMult
);
114 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT
))));
// GUARDBAND_TOP: y < -w * gbState.top[viewport]
117 gbMult
= SIMD_T::mul_ps(vNegW
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.top
[0], viewportIndexes
));
118 vRes
= SIMD_T::cmplt_ps(vertex
.y
, gbMult
);
119 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP
))));
// GUARDBAND_RIGHT: x > w * gbState.right[viewport]
122 gbMult
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.right
[0], viewportIndexes
));
123 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, gbMult
);
124 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT
))));
// GUARDBAND_BOTTOM: y > w * gbState.bottom[viewport]
127 gbMult
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.bottom
[0], viewportIndexes
));
128 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, gbMult
);
129 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM
))));
132 template<typename SIMD_T
>
// SIMD256 binner selector: stores the binning function to call in pfnBinFunc.
// One constructor chooses by vertex count per primitive, the other by PRIMITIVE_TOPOLOGY;
// BinFunc() forwards its arguments to the stored pointer.
138 struct BinnerChooser
<SIMD256
>
140 PFN_PROCESS_PRIMS pfnBinFunc
;
// Choose by vertex count: 3 -> GetBinTrianglesFunc(conservativeRast > 0), 2 -> BinLines.
// Any other count hits the assert below.
142 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
145 if (numVertsPerPrim
== 3)
147 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
150 else if (numVertsPerPrim
== 2)
152 pfnBinFunc
= BinLines
;
156 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Choose by topology: BinPoints, BinLines (for the line topologies listed) or the triangle binner.
// NOTE(review): only part of the case list is visible here; confirm the full topology mapping.
160 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
166 pfnBinFunc
= BinPoints
;
171 case TOP_LINE_LIST_ADJ
:
172 case TOP_LISTSTRIP_ADJ
:
173 pfnBinFunc
= BinLines
;
176 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
// Invokes the selected binner; asserts that a function was selected.
181 void BinFunc(DRAW_CONTEXT
*pDC
, PA_STATE
&pa
, uint32_t workerId
, SIMD256::Vec4 prims
[], uint32_t primMask
, SIMD256::Integer
const &primID
, SIMD256::Integer
&viewportIdx
, SIMD256::Integer
&rtIdx
)
183 SWR_ASSERT(pfnBinFunc
!= nullptr);
185 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
189 #if USE_SIMD16_FRONTEND
// SIMD512 binner selector: same shape as the SIMD256 version but stores a
// PFN_PROCESS_PRIMS_SIMD16 and selects among the *_simd16 binning functions.
191 struct BinnerChooser
<SIMD512
>
193 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc
;
// Choose by vertex count: 3 -> GetBinTrianglesFunc_simd16(conservativeRast > 0), 2 -> BinLines_simd16.
// Any other count hits the assert below.
195 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
198 if (numVertsPerPrim
== 3)
200 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
203 else if (numVertsPerPrim
== 2)
205 pfnBinFunc
= BinLines_simd16
;
209 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Choose by topology: BinPoints_simd16, BinLines_simd16 or the SIMD16 triangle binner.
// NOTE(review): only part of the case list is visible here; confirm the full topology mapping.
213 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
219 pfnBinFunc
= BinPoints_simd16
;
224 case TOP_LINE_LIST_ADJ
:
225 case TOP_LISTSTRIP_ADJ
:
226 pfnBinFunc
= BinLines_simd16
;
229 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
// Invokes the selected binner; asserts that a function was selected.
234 void BinFunc(DRAW_CONTEXT
*pDC
, PA_STATE
&pa
, uint32_t workerId
, SIMD512::Vec4 prims
[], uint32_t primMask
, SIMD512::Integer
const &primID
, SIMD512::Integer
&viewportIdx
, SIMD512::Integer
&rtIdx
)
236 SWR_ASSERT(pfnBinFunc
!= nullptr);
238 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
243 template<typename SIMD_T
>
// Width-specific helpers for SIMD256.
249 struct SimdHelper
<SIMD256
>
// Takes a SIMD256 float and returns a SIMD256 float.
// NOTE(review): presumably returns `a` unchanged for this width; confirm.
251 static SIMD256::Float
insert_lo_ps(SIMD256::Float a
)
// Lane-wise a == b, returned as a bitmask built with movemask_ps.
256 static SIMD256::Mask
cmpeq_ps_mask(SIMD256::Float a
, SIMD256::Float b
)
258 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a
, b
));
262 #if USE_SIMD16_FRONTEND
// Width-specific helpers for SIMD512.
264 struct SimdHelper
<SIMD512
>
// Widens a SIMD256 float: inserts `a` at position 0 of a zeroed SIMD512 register.
266 static SIMD512::Float
insert_lo_ps(SIMD256::Float a
)
268 return SIMD512::insert_ps
<0>(SIMD512::setzero_ps(), a
);
// Lane-wise a == b using the EQ_OQ predicate, returned directly as a mask.
271 static SIMD512::Mask
cmpeq_ps_mask(SIMD512::Float a
, SIMD512::Float b
)
273 return SIMD512::cmp_ps_mask
<SIMD16::CompareType::EQ_OQ
>(a
, b
);
278 // Temp storage used by the clipper
279 template<typename SIMD_T
>
// SIMD256 accessor for the clipper's temporary vertex storage.
285 struct ClipHelper
<SIMD256
>
// Returns the tlsTempVertices array declared extern THREAD near the top of this file.
287 static SIMDVERTEX_T
<SIMD256
> *GetTempVertices()
289 return tlsTempVertices
;
293 #if USE_SIMD16_FRONTEND
// SIMD512 accessor for the clipper's temporary vertex storage.
295 struct ClipHelper
<SIMD512
>
// Returns the tlsTempVertices_simd16 array declared extern THREAD near the top of this file.
297 static SIMDVERTEX_T
<SIMD512
> *GetTempVertices()
299 return tlsTempVertices_simd16
;
304 template<typename SIMD_T
, uint32_t NumVertsPerPrim
>
308 INLINE
Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
309 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
311 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
314 void ComputeClipCodes(typename
SIMD_T::Vec4 vertex
[], const typename
SIMD_T::Integer
&viewportIndexes
)
316 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
318 ::ComputeClipCodes
<SIMD_T
>(state
, vertex
[i
], clipCodes
[i
], viewportIndexes
);
// ANDs together the clip codes of all NumVertsPerPrim vertices, starting from clipCodes[0].
// NOTE(review): presumably `result` is returned; confirm.
322 typename
SIMD_T::Float
ComputeClipCodeIntersection()
324 typename
SIMD_T::Float result
= clipCodes
[0];
326 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
328 result
= SIMD_T::and_ps(result
, clipCodes
[i
]);
// ORs together the clip codes of all NumVertsPerPrim vertices, starting from clipCodes[0].
// NOTE(review): presumably `result` is returned; confirm.
334 typename
SIMD_T::Float
ComputeClipCodeUnion()
336 typename
SIMD_T::Float result
= clipCodes
[0];
338 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
340 result
= SIMD_T::or_ps(result
, clipCodes
[i
]);
346 int ComputeClipMask()
348 typename
SIMD_T::Float clipUnion
= ComputeClipCodeUnion();
350 clipUnion
= SIMD_T::and_ps(clipUnion
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK
)));
352 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion
, SIMD_T::setzero_ps()));
355 // clipper is responsible for culling any prims with NAN coordinates
356 int ComputeNaNMask(typename
SIMD_T::Vec4 prim
[])
358 typename
SIMD_T::Float vNanMask
= SIMD_T::setzero_ps();
360 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
362 typename
SIMD_T::Float vNan01
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[0], prim
[e
].v
[1]);
363 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan01
);
365 typename
SIMD_T::Float vNan23
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[2], prim
[e
].v
[3]);
366 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan23
);
369 return SIMD_T::movemask_ps(vNanMask
);
// Returns a lane bitmask of primitives to discard because of user cull / clip distances.
//  - For every bit set in backendState.cullDistanceMask, a lane is flagged only when the
//    distance is negative or NaN on ALL vertices of the primitive (AND across vertices);
//    the results for the individual distances are ORed together.
//  - For every bit set in backendState.clipDistanceMask, a lane is flagged when the
//    distance is NaN on ANY vertex.
// The distances are assembled from slots vertexClipCullOffset and vertexClipCullOffset + 1.
372 int ComputeUserClipCullMask(PA_STATE
&pa
, typename
SIMD_T::Vec4 prim
[])
374 uint8_t cullMask
= state
.backendState
.cullDistanceMask
;
375 uint32_t vertexClipCullOffset
= state
.backendState
.vertexClipCullOffset
;
377 typename
SIMD_T::Float vClipCullMask
= SIMD_T::setzero_ps();
379 typename
SIMD_T::Vec4 vClipCullDistLo
[3];
380 typename
SIMD_T::Vec4 vClipCullDistHi
[3];
382 pa
.Assemble(vertexClipCullOffset
, vClipCullDistLo
);
383 pa
.Assemble(vertexClipCullOffset
+ 1, vClipCullDistHi
);
// iterate over the set bits of cullMask, lowest first, clearing each as it is handled
386 while (_BitScanForward(&index
, cullMask
))
388 cullMask
&= ~(1 << index
);
// bit index -> (slot, component): 4 components per slot
389 uint32_t slot
= index
>> 2;
390 uint32_t component
= index
& 0x3;
// start from all-ones so the AND below keeps only lanes culled on every vertex
392 typename
SIMD_T::Float vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
393 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
395 typename
SIMD_T::Float vCullComp
;
// NOTE(review): presumably `slot` selects between the Lo and Hi arrays here; confirm.
398 vCullComp
= vClipCullDistLo
[e
][component
];
402 vCullComp
= vClipCullDistHi
[e
][component
];
405 // cull if cull distance < 0 || NAN
406 typename
SIMD_T::Float vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(SIMD_T::setzero_ps(), vCullComp
);
407 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
409 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
412 // clipper should also discard any primitive with NAN clip distance
413 uint8_t clipMask
= state
.backendState
.clipDistanceMask
;
414 while (_BitScanForward(&index
, clipMask
))
416 clipMask
&= ~(1 << index
);
417 uint32_t slot
= index
>> 2;
418 uint32_t component
= index
& 0x3;
420 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
422 typename
SIMD_T::Float vClipComp
;
// NOTE(review): presumably `slot` selects between the Lo and Hi arrays here; confirm.
425 vClipComp
= vClipCullDistLo
[e
][component
];
429 vClipComp
= vClipCullDistHi
[e
][component
];
// comparing a value with itself is unordered only when it is NaN
432 typename
SIMD_T::Float vClip
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(vClipComp
, vClipComp
);
433 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vClip
);
437 return SIMD_T::movemask_ps(vClipCullMask
);
// Clips one SIMD batch of primitives and bins the result.
// Steps visible in this function:
//   1. Copy positions, backend attributes and (when enabled) user clip distances of each
//      vertex into the local `vertices` array; attributes flagged in constantInterpMask take
//      the provoking vertex's value on every vertex.
//   2. Call ClipPrims, which returns the per-lane count of vertices after clipping.
//   3. For each input primitive, gather that lane's clipped vertices into transposedPrims,
//      wrap them in a PA_STATE_OPT and pass the assembled positions to the binner.
//   4. Add the number of emitted primitives to the CPrimitives stat.
// Parameters: prim = input positions; vPrimMask / vClipMask = lane masks handed to ClipPrims;
// pa = source primitive assembler; vPrimId / vViewportIdx / vRtIdx = per-lane ids that are
// broadcast per input primitive before binning.
440 void ClipSimd(const typename
SIMD_T::Vec4 prim
[], const typename
SIMD_T::Float
&vPrimMask
, const typename
SIMD_T::Float
&vClipMask
, PA_STATE
&pa
,
441 const typename
SIMD_T::Integer
&vPrimId
, const typename
SIMD_T::Integer
&vViewportIdx
, const typename
SIMD_T::Integer
&vRtIdx
)
443 // input/output vertex store for clipper
444 SIMDVERTEX_T
<SIMD_T
> vertices
[7]; // maximum 7 verts generated per triangle
446 uint32_t constantInterpMask
= state
.backendState
.constantInterpolationMask
;
447 uint32_t provokingVertex
= 0;
448 if (pa
.binTopology
== TOP_TRIANGLE_FAN
)
450 provokingVertex
= state
.frontendState
.provokingVertex
.triFan
;
452 ///@todo: line topology for wireframe?
455 typename
SIMD_T::Vec4 tmpVector
[NumVertsPerPrim
];
456 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
458 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = prim
[i
];
462 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
// maxSlot tracks the highest mapped attribute slot seen; numAttribs below is maxSlot + 1
464 int32_t maxSlot
= -1;
465 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
467 // Compute absolute attrib slot in vertex array
468 uint32_t mapSlot
= backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
469 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
470 uint32_t inputSlot
= backendState
.vertexAttribOffset
+ mapSlot
;
472 pa
.Assemble(inputSlot
, tmpVector
);
474 // if constant interpolation enabled for this attribute, assign the provoking
475 // vertex values to all edges
476 if (CheckBit(constantInterpMask
, slot
))
478 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
480 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
485 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
487 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
492 // assemble user clip distances if enabled
493 uint32_t vertexClipCullSlot
= state
.backendState
.vertexClipCullOffset
;
494 if (state
.backendState
.clipDistanceMask
& 0xf)
496 pa
.Assemble(vertexClipCullSlot
, tmpVector
);
497 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
499 vertices
[i
].attrib
[vertexClipCullSlot
] = tmpVector
[i
];
503 if (state
.backendState
.clipDistanceMask
& 0xf0)
505 pa
.Assemble(vertexClipCullSlot
+ 1, tmpVector
);
506 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
508 vertices
[i
].attrib
[vertexClipCullSlot
+ 1] = tmpVector
[i
];
512 uint32_t numAttribs
= maxSlot
+ 1;
514 typename
SIMD_T::Integer vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
516 BinnerChooser
<SIMD_T
> binner(NumVertsPerPrim
, pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
518 // set up new PA for binning clipped primitives
519 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
520 if (NumVertsPerPrim
== 3)
522 clipTopology
= TOP_TRIANGLE_FAN
;
524 // so that the binner knows to bloat wide points later
525 if (pa
.binTopology
== TOP_POINT_LIST
)
527 clipTopology
= TOP_POINT_LIST
;
530 else if (NumVertsPerPrim
== 2)
532 clipTopology
= TOP_LINE_LIST
;
536 SWR_ASSERT(0 && "Unexpected points in clipper.");
// view the SIMD registers as arrays of 32-bit lanes so they can be indexed by input primitive
539 const uint32_t *pVertexCount
= reinterpret_cast<const uint32_t *>(&vNumClippedVerts
);
540 const uint32_t *pPrimitiveId
= reinterpret_cast<const uint32_t *>(&vPrimId
);
541 const uint32_t *pViewportIdx
= reinterpret_cast<const uint32_t *>(&vViewportIdx
);
542 const uint32_t *pRtIdx
= reinterpret_cast<const uint32_t *>(&vRtIdx
);
// byte offsets of vertices[0..6] used by the gathers below; one lane is unused
544 const SIMD256::Integer vOffsets
= SIMD256::set_epi32(
545 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>), // unused lane
546 6 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
547 5 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
548 4 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
549 3 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
550 2 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
551 1 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
552 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>));
554 // only need to gather 7 verts
555 // @todo dynamic mask based on actual # of verts generated per lane
556 const SIMD256::Float vMask
= SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
558 uint32_t numClippedPrims
= 0;
560 // transpose clipper output so that each lane's vertices are in SIMD order
561 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// NOTE(review): two declarations of transposedPrims follow (heap and stack); presumably they
// are alternatives selected by a preprocessor condition, with the AlignedFree near the end of
// this function pairing with the heap variant. Confirm.
565 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
566 SIMDVERTEX_T
<SIMD_T
> *transposedPrims
= reinterpret_cast<SIMDVERTEX_T
<SIMD_T
> *>(AlignedMalloc(sizeof(SIMDVERTEX_T
<SIMD_T
>) * 2, 64));
569 SIMDVERTEX_T
<SIMD_T
> transposedPrims
[2];
572 uint32_t numInputPrims
= pa
.NumPrims();
573 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
575 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
576 if (numEmittedVerts
< NumVertsPerPrim
)
580 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
582 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
583 SWR_ASSERT(numEmittedPrims
<= 7, "Unexpected primitive count from clipper.");
585 numClippedPrims
+= numEmittedPrims
;
587 // transpose clipper output so that each lane's vertices are in SIMD order
588 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// pBase points at lane `inputPrim` of component 0 of the position slot of vertices[0]
592 uint8_t *pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
595 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
// NOTE(review): as a function-local static, dummy is initialised only the first time this
// statement executes.
596 static const float *dummy
= reinterpret_cast<const float *>(pBase
);
// gather each of the 4 position components; pBase advances one SIMD register per component
599 for (uint32_t c
= 0; c
< 4; ++c
)
601 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
602 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
603 pBase
+= sizeof(typename
SIMD_T::Float
);
// same gather for every backend attribute, starting at vertexAttribOffset
607 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[backendState
.vertexAttribOffset
]) + sizeof(float) * inputPrim
;
609 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
611 uint32_t attribSlot
= backendState
.vertexAttribOffset
+ attrib
;
613 for (uint32_t c
= 0; c
< 4; ++c
)
615 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
616 transposedPrims
[0].attrib
[attribSlot
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
617 pBase
+= sizeof(typename
SIMD_T::Float
);
621 // transpose user clip distances if enabled
622 uint32_t vertexClipCullSlot
= backendState
.vertexClipCullOffset
;
623 if (state
.backendState
.clipDistanceMask
& 0x0f)
625 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[vertexClipCullSlot
]) + sizeof(float) * inputPrim
;
627 for (uint32_t c
= 0; c
< 4; ++c
)
629 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
630 transposedPrims
[0].attrib
[vertexClipCullSlot
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
631 pBase
+= sizeof(typename
SIMD_T::Float
);
635 if (state
.backendState
.clipDistanceMask
& 0xf0)
637 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[vertexClipCullSlot
+ 1]) + sizeof(float) * inputPrim
;
639 for (uint32_t c
= 0; c
< 4; ++c
)
641 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
642 transposedPrims
[0].attrib
[vertexClipCullSlot
+ 1][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
643 pBase
+= sizeof(typename
SIMD_T::Float
);
// assemble the transposed vertices as clipTopology primitives and hand them to the binner
647 PA_STATE_OPT
clipPA(pDC
, numEmittedPrims
, reinterpret_cast<uint8_t *>(&transposedPrims
[0]), numEmittedVerts
, SWR_VTX_NUM_SLOTS
, true, NumVertsPerPrim
, clipTopology
);
648 clipPA
.viewportArrayActive
= pa
.viewportArrayActive
;
649 clipPA
.rtArrayActive
= pa
.rtArrayActive
;
// primMaskMap[n] has the low n bits set; indexed by numEmittedPrims (asserted <= 7 above)
651 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
653 const uint32_t primMask
= primMaskMap
[numEmittedPrims
];
// every clipped primitive inherits the ids of the input primitive it came from
655 const typename
SIMD_T::Integer primID
= SIMD_T::set1_epi32(pPrimitiveId
[inputPrim
]);
656 const typename
SIMD_T::Integer viewportIdx
= SIMD_T::set1_epi32(pViewportIdx
[inputPrim
]);
657 const typename
SIMD_T::Integer rtIdx
= SIMD_T::set1_epi32(pRtIdx
[inputPrim
]);
660 while (clipPA
.GetNextStreamOutput())
664 typename
SIMD_T::Vec4 attrib
[NumVertsPerPrim
];
666 bool assemble
= clipPA
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
670 binner
.pfnBinFunc(pDC
, clipPA
, workerId
, attrib
, primMask
, primID
, viewportIdx
, rtIdx
);
673 } while (clipPA
.NextPrim());
678 AlignedFree(transposedPrims
);
681 // update global pipeline stat
682 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
// Entry point of the clip stage for one SIMD batch of primitives.
// Visible steps: count the invocations (popcount of primMask) into the CInvocations stat,
// compute clip codes, drop lanes with NaN coordinates, drop user-culled lanes when
// cullDistanceMask is non-zero, derive validMask (lanes whose clip-code intersection is zero)
// and clipMask (lanes touching a GUARDBAND_CLIP_MASK plane; stays 0 when NumVertsPerPrim == 1),
// then either run ClipSimd or forward the valid lanes straight to the binner.
// NOTE(review): the condition choosing between ClipSimd and direct binning is not visible
// here; presumably it is whether clipMask is non-zero. Confirm.
685 void ExecuteStage(PA_STATE
&pa
, typename
SIMD_T::Vec4 prim
[], uint32_t primMask
,
686 typename
SIMD_T::Integer
const &primId
, typename
SIMD_T::Integer
const &viewportIdx
, typename
SIMD_T::Integer
const &rtIdx
)
688 SWR_ASSERT(pa
.pDC
!= nullptr);
690 BinnerChooser
<SIMD_T
> binner(pa
.binTopology
, pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
692 // update clipper invocations pipeline stat
693 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
694 UPDATE_STAT_FE(CInvocations
, numInvoc
);
696 ComputeClipCodes(prim
, viewportIdx
);
698 // cull prims with NAN coords
699 primMask
&= ~ComputeNaNMask(prim
);
701 // user cull distance cull
702 if (state
.backendState
.cullDistanceMask
)
704 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
707 // cull prims outside view frustum
// a non-zero intersection means all vertices are outside the same plane
708 typename
SIMD_T::Float clipIntersection
= ComputeClipCodeIntersection();
709 int validMask
= primMask
& SimdHelper
<SIMD_T
>::cmpeq_ps_mask(clipIntersection
, SIMD_T::setzero_ps());
711 // skip clipping for points
712 uint32_t clipMask
= 0;
713 if (NumVertsPerPrim
!= 1)
715 clipMask
= primMask
& ComputeClipMask();
718 AR_EVENT(ClipInfoEvent(numInvoc
, validMask
, clipMask
));
722 RDTSC_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
723 // we have to clip tris, execute the clipper, which will also bin the clipped primitives
725 ClipSimd(prim
, SIMD_T::vmask_ps(primMask
), SIMD_T::vmask_ps(clipMask
), pa
, primId
, viewportIdx
, rtIdx
);
726 RDTSC_END(FEGuardbandClip
, 1);
730 // update CPrimitives pipeline stat
731 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
733 // forward valid prims directly to binner
734 binner
.pfnBinFunc(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
, viewportIdx
, rtIdx
);
739 typename
SIMD_T::Float
ComputeInterpFactor(typename
SIMD_T::Float
const &boundaryCoord0
, typename
SIMD_T::Float
const &boundaryCoord1
)
741 return SIMD_T::div_ps(boundaryCoord0
, SIMD_T::sub_ps(boundaryCoord0
, boundaryCoord1
));
// Computes per-lane byte offsets into an array of SIMDVERTEX_T<SIMD_T>:
//   vIndices * sizeof(SIMDVERTEX_T) + attrib * sizeof(Vec4) + component * sizeof(Float) + elemOffset[lane]
// NOTE(review): the contents of elemOffset are not visible here (presumably lane * sizeof(float)),
// and presumably vOffsets is returned. Confirm both.
744 typename
SIMD_T::Integer
ComputeOffsets(uint32_t attrib
, typename
SIMD_T::Integer
const &vIndices
, uint32_t component
)
746 const uint32_t simdVertexStride
= sizeof(SIMDVERTEX_T
<SIMD_T
>);
747 const uint32_t componentStride
= sizeof(typename
SIMD_T::Float
);
748 const uint32_t attribStride
= sizeof(typename
SIMD_T::Vec4
);
750 static const OSALIGNSIMD16(uint32_t) elemOffset
[16] =
// the table must hold at least one entry per lane of SIMD_T::Integer
770 static_assert(sizeof(typename
SIMD_T::Integer
) <= sizeof(elemOffset
), "Clipper::ComputeOffsets, Increase number of element offsets.");
772 typename
SIMD_T::Integer vElemOffset
= SIMD_T::loadu_si(reinterpret_cast<const typename
SIMD_T::Integer
*>(elemOffset
));
774 // step to the simdvertex
775 typename
SIMD_T::Integer vOffsets
= SIMD_T::mullo_epi32(vIndices
, SIMD_T::set1_epi32(simdVertexStride
));
777 // step to the attribute and component
778 vOffsets
= SIMD_T::add_epi32(vOffsets
, SIMD_T::set1_epi32(attribStride
* attrib
+ componentStride
* component
));
// add the per-lane element offset
781 vOffsets
= SIMD_T::add_epi32(vOffsets
, vElemOffset
);
786 typename
SIMD_T::Float
GatherComponent(const float* pBuffer
, uint32_t attrib
, typename
SIMD_T::Float
const &vMask
, typename
SIMD_T::Integer
const &vIndices
, uint32_t component
)
788 typename
SIMD_T::Integer vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
789 typename
SIMD_T::Float vSrc
= SIMD_T::setzero_ps();
791 return SIMD_T::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(vSrc
, pBuffer
, vOffsets
, vMask
);
// Writes one component of one attribute for the lanes set in vMask: for each such lane,
// stores vSrc[lane] at pBuffer + ComputeOffsets(attrib, vIndices, component)[lane] (byte offset).
// NOTE(review): pBuffer is declared const float* but is written through a cast below.
794 void ScatterComponent(const float* pBuffer
, uint32_t attrib
, typename
SIMD_T::Float
const &vMask
, typename
SIMD_T::Integer
const &vIndices
, uint32_t component
, typename
SIMD_T::Float
const &vSrc
)
796 typename
SIMD_T::Integer vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
// view the SIMD registers as per-lane arrays
798 const uint32_t *pOffsets
= reinterpret_cast<const uint32_t *>(&vOffsets
);
799 const float *pSrc
= reinterpret_cast<const float *>(&vSrc
);
800 uint32_t mask
= SIMD_T::movemask_ps(vMask
);
// visit each set lane, lowest first, clearing it once written
802 while (_BitScanForward(&lane
, mask
))
804 mask
&= ~(1 << lane
);
805 const uint8_t *pBuf
= reinterpret_cast<const uint8_t *>(pBuffer
) + pOffsets
[lane
];
806 *(float *)pBuf
= pSrc
[lane
];
// Computes the intersection of the edge (v1, v2) with ClippingPlane for the lanes in
// vActiveMask and writes the new vertex into pOutVerts at index outIndex.
// t comes from ComputeInterpFactor on the two vertices' boundary values for the plane; the
// position, every attribute and each enabled clip distance are written as v1 + (v2 - v1) * t.
810 template<SWR_CLIPCODES ClippingPlane
>
812 const typename
SIMD_T::Float
&vActiveMask
, // active lanes to operate on
813 const typename
SIMD_T::Integer
&s
, // index to first edge vertex v0 in pInPts.
814 const typename
SIMD_T::Integer
&p
, // index to second edge vertex v1 in pInPts.
815 const typename
SIMD_T::Vec4
&v1
, // vertex 0 position
816 const typename
SIMD_T::Vec4
&v2
, // vertex 1 position
817 typename
SIMD_T::Integer
&outIndex
, // output index.
818 const float *pInVerts
, // array of all the input positions.
819 uint32_t numInAttribs
, // number of attributes per vertex.
820 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
822 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
823 uint32_t vertexClipCullOffset
= this->state
.backendState
.vertexClipCullOffset
;
825 // compute interpolation factor
// boundary value per plane: w + x (left), w - x (right), w + y (top), w - y (bottom),
// z or w + z (near, depending on clipHalfZ), w - z (far)
826 typename
SIMD_T::Float t
;
827 switch (ClippingPlane
)
829 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[0]), SIMD_T::add_ps(v2
[3], v2
[0])); break;
830 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[0]), SIMD_T::sub_ps(v2
[3], v2
[0])); break;
831 case FRUSTUM_TOP
: t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[1]), SIMD_T::add_ps(v2
[3], v2
[1])); break;
832 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[1]), SIMD_T::sub_ps(v2
[3], v2
[1])); break;
834 // DX Znear plane is 0, GL is -w
835 if (this->state
.rastState
.clipHalfZ
)
837 t
= ComputeInterpFactor(v1
[2], v2
[2]);
841 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[2]), SIMD_T::add_ps(v2
[3], v2
[2]));
844 case FRUSTUM_FAR
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[2]), SIMD_T::sub_ps(v2
[3], v2
[2])); break;
845 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
848 // interpolate position and store
849 for (uint32_t c
= 0; c
< 4; ++c
)
851 typename
SIMD_T::Float vOutPos
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
852 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
855 // interpolate attributes and store
856 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
858 uint32_t attribSlot
= vertexAttribOffset
+ a
;
859 for (uint32_t c
= 0; c
< 4; ++c
)
861 typename
SIMD_T::Float vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
862 typename
SIMD_T::Float vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
863 typename
SIMD_T::Float vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
864 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
868 // interpolate clip distance if enabled
869 if (this->state
.backendState
.clipDistanceMask
& 0xf)
871 uint32_t attribSlot
= vertexClipCullOffset
;
872 for (uint32_t c
= 0; c
< 4; ++c
)
874 typename
SIMD_T::Float vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
875 typename
SIMD_T::Float vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
876 typename
SIMD_T::Float vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
877 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
881 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
883 uint32_t attribSlot
= vertexClipCullOffset
+ 1;
884 for (uint32_t c
= 0; c
< 4; ++c
)
886 typename
SIMD_T::Float vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
887 typename
SIMD_T::Float vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
888 typename
SIMD_T::Float vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
889 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
// Returns a lane mask that is set where position v lies on the inside of ClippingPlane.
// The compares are inclusive (cmpge / cmple), so a vertex exactly on the plane counts as inside.
// For FRUSTUM_NEAR the boundary is 0 when rastState.clipHalfZ is set and -w otherwise.
// For a plane not listed below, reports SWR_INVALID and returns an all-zero mask.
894 template<SWR_CLIPCODES ClippingPlane
>
895 typename
SIMD_T::Float
inside(const typename
SIMD_T::Vec4
&v
)
897 switch (ClippingPlane
)
899 case FRUSTUM_LEFT
: return SIMD_T::cmpge_ps(v
[0], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
900 case FRUSTUM_RIGHT
: return SIMD_T::cmple_ps(v
[0], v
[3]);
901 case FRUSTUM_TOP
: return SIMD_T::cmpge_ps(v
[1], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
902 case FRUSTUM_BOTTOM
: return SIMD_T::cmple_ps(v
[1], v
[3]);
903 case FRUSTUM_NEAR
: return SIMD_T::cmpge_ps(v
[2], this->state
.rastState
.clipHalfZ
? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
904 case FRUSTUM_FAR
: return SIMD_T::cmple_ps(v
[2], v
[3]);
906 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
907 return SIMD_T::setzero_ps();
// Clips a SIMD batch of polygons against ClippingPlane, one edge per loop iteration.
// For edge (s, p), where p is the vertex after s and wraps to 0 past the last vertex:
//   - if s is inside, s is copied to the output (position, attributes, enabled clip distances);
//   - if s and p lie on opposite sides, the intersection point is appended via intersect<>().
// vOutIndex holds the per-lane count of vertices written so far; lanes stay active while
// vCurIndex < vNumInPts.
// NOTE(review): presumably vOutIndex is the return value; confirm.
911 template<SWR_CLIPCODES ClippingPlane
>
912 typename
SIMD_T::Integer
ClipTriToPlane(const float *pInVerts
, const typename
SIMD_T::Integer
&vNumInPts
, uint32_t numInAttribs
, float *pOutVerts
)
914 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
916 typename
SIMD_T::Integer vCurIndex
= SIMD_T::setzero_si();
917 typename
SIMD_T::Integer vOutIndex
= SIMD_T::setzero_si();
918 typename
SIMD_T::Float vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
920 while (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
922 typename
SIMD_T::Integer s
= vCurIndex
;
923 typename
SIMD_T::Integer p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
// keep p where p < vNumInPts, otherwise wrap it back to vertex 0
924 typename
SIMD_T::Integer underFlowMask
= SIMD_T::cmpgt_epi32(vNumInPts
, p
);
925 p
= SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p
), SIMD_T::castsi_ps(underFlowMask
)));
// gather the positions of both edge vertices
928 typename
SIMD_T::Vec4 vInPos0
, vInPos1
;
929 for (uint32_t c
= 0; c
< 4; ++c
)
931 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
932 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
935 // compute inside mask
936 typename
SIMD_T::Float s_in
= inside
<ClippingPlane
>(vInPos0
);
937 typename
SIMD_T::Float p_in
= inside
<ClippingPlane
>(vInPos1
);
939 // compute intersection mask (s_in != p_in)
940 typename
SIMD_T::Float intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
941 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
// emit s for the active lanes where it is inside
944 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
945 if (!SIMD_T::testz_ps(s_in
, s_in
))
948 for (uint32_t c
= 0; c
< 4; ++c
)
950 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
954 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
956 uint32_t attribSlot
= vertexAttribOffset
+ a
;
957 for (uint32_t c
= 0; c
< 4; ++c
)
959 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
960 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
964 // store clip distance if enabled
965 uint32_t vertexClipCullSlot
= this->state
.backendState
.vertexClipCullOffset
;
966 if (this->state
.backendState
.clipDistanceMask
& 0xf)
968 uint32_t attribSlot
= vertexClipCullSlot
;
969 for (uint32_t c
= 0; c
< 4; ++c
)
971 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
972 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
976 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
978 uint32_t attribSlot
= vertexClipCullSlot
+ 1;
979 for (uint32_t c
= 0; c
< 4; ++c
)
981 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
982 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
986 // increment outIndex
987 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
990 // compute and store intersection
991 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
993 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
995 // increment outIndex for active lanes
996 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
999 // increment loop index and update active mask
1000 vCurIndex
= SIMD_T::add_epi32(vCurIndex
, SIMD_T::set1_epi32(1));
1001 vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1007 template<SWR_CLIPCODES ClippingPlane
>
1008 typename
SIMD_T::Integer
ClipLineToPlane(const float *pInVerts
, const typename
SIMD_T::Integer
&vNumInPts
, uint32_t numInAttribs
, float *pOutVerts
)
1010 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1012 typename
SIMD_T::Integer vCurIndex
= SIMD_T::setzero_si();
1013 typename
SIMD_T::Integer vOutIndex
= SIMD_T::setzero_si();
1014 typename
SIMD_T::Float vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1016 if (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
))
1018 typename
SIMD_T::Integer s
= vCurIndex
;
1019 typename
SIMD_T::Integer p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
1022 typename
SIMD_T::Vec4 vInPos0
, vInPos1
;
1023 for (uint32_t c
= 0; c
< 4; ++c
)
1025 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1026 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1029 // compute inside mask
1030 typename
SIMD_T::Float s_in
= inside
<ClippingPlane
>(vInPos0
);
1031 typename
SIMD_T::Float p_in
= inside
<ClippingPlane
>(vInPos1
);
1033 // compute intersection mask (s_in != p_in)
1034 typename
SIMD_T::Float intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1035 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1037 // store s if inside
1038 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1039 if (!SIMD_T::testz_ps(s_in
, s_in
))
1041 for (uint32_t c
= 0; c
< 4; ++c
)
1043 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
1046 // interpolate attributes and store
1047 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1049 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1050 for (uint32_t c
= 0; c
< 4; ++c
)
1052 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1053 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1057 // increment outIndex
1058 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1061 // compute and store intersection
1062 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1064 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1066 // increment outIndex for active lanes
1067 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1070 // store p if inside
1071 p_in
= SIMD_T::and_ps(p_in
, vActiveMask
);
1072 if (!SIMD_T::testz_ps(p_in
, p_in
))
1074 for (uint32_t c
= 0; c
< 4; ++c
)
1076 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
1079 // interpolate attributes and store
1080 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1082 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1083 for (uint32_t c
= 0; c
< 4; ++c
)
1085 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
1086 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
1090 // increment outIndex
1091 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), p_in
);
1098 typename
SIMD_T::Integer
ClipPrims(float *pVertices
, const typename
SIMD_T::Float
&vPrimMask
, const typename
SIMD_T::Float
&vClipMask
, int numAttribs
)
1101 float *pTempVerts
= reinterpret_cast<float *>(ClipHelper
<SIMD_T
>::GetTempVertices());
1103 // zero out num input verts for non-active lanes
1104 typename
SIMD_T::Integer vNumInPts
= SIMD_T::set1_epi32(NumVertsPerPrim
);
1105 vNumInPts
= SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts
, vClipMask
);
1107 // clip prims to frustum
1108 typename
SIMD_T::Integer vNumOutPts
;
1109 if (NumVertsPerPrim
== 3)
1111 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1112 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1113 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1114 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1115 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1116 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1120 SWR_ASSERT(NumVertsPerPrim
== 2);
1121 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1122 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1123 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1124 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1125 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1126 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1129 // restore num verts for non-clipped, active lanes
1130 typename
SIMD_T::Float vNonClippedMask
= SIMD_T::andnot_ps(vClipMask
, vPrimMask
);
1131 vNumOutPts
= SIMD_T::blendv_epi32(vNumOutPts
, SIMD_T::set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
// Data members of the clipper; the enclosing class header is outside this chunk.
// Worker id, brace-initialized to 0.
// NOTE(review): presumably identifies the worker thread that owns this clipper - confirm against the constructor.
1136 const uint32_t workerId
{ 0 };
// Draw context pointer, brace-initialized to nullptr.
1137 DRAW_CONTEXT
*pDC
{ nullptr };
// Read-only reference to the API state. The clip routines read
// state.backendState through it (vertexAttribOffset, vertexClipCullOffset,
// clipDistanceMask).
1138 const API_STATE
&state
;
// One SIMD float per primitive vertex.
// NOTE(review): presumably holds the per-vertex clip codes; the code that
// fills and reads it is not visible in this chunk.
1139 typename
SIMD_T::Float clipCodes
[NumVertsPerPrim
];
1143 // pipeline stage functions
// Declarations of the triangle, line and point clip entry points; their
// definitions are not part of this chunk. All three take the same parameter
// list: the draw context, the PA_STATE, a worker id, the array of primitive
// vertex vectors, a 32-bit primitive mask, and the SIMD integer vectors
// primId, viewportIdx and rtIdx.
// NOTE(review): primMask presumably carries one bit per SIMD lane - confirm
// against the definitions.
1144 void ClipTriangles(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
1145 void ClipLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
1146 void ClipPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
// SIMD16 variants of the triangle, line and point clip entry points, declared
// only when USE_SIMD16_FRONTEND is set. They take simd16vector / simd16scalari
// arguments and carry the SIMDCALL calling-convention macro; their definitions
// are not part of this chunk.
// NOTE(review): the matching #endif is not visible in this chunk.
1147 #if USE_SIMD16_FRONTEND
1148 void SIMDCALL
ClipTriangles_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);
1149 void SIMDCALL
ClipLines_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);
1150 void SIMDCALL
ClipPoints_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);