1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for clipping
27 ******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
35 // Temp storage used by the clipper
// Both arrays are declared extern here (defined elsewhere) and hold 7 SIMD-wide vertices each.
// They are handed out by ClipHelper::GetTempVertices().
// NOTE(review): THREAD is presumably a thread-local storage qualifier ("tls" prefix) - confirm against its definition.
36 extern THREAD SIMDVERTEX_T
<SIMD256
> tlsTempVertices
[7];
// SIMD512-wide counterpart, only declared when USE_SIMD16_FRONTEND is set.
// NOTE(review): the closing #endif is not visible in this listing.
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T
<SIMD512
> tlsTempVertices_simd16
[7];
43 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
44 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
45 #define CLIPCODE_SHIFT 23
// Clip code enumerators (used as SWR_CLIPCODES template arguments by the clipper).
// NOTE(review): the enum's opening/closing lines are not visible in this listing.
// View-frustum planes: one distinct bit each, starting at bit CLIPCODE_SHIFT.
46 FRUSTUM_LEFT
= (0x01 << CLIPCODE_SHIFT
),
47 FRUSTUM_TOP
= (0x02 << CLIPCODE_SHIFT
),
48 FRUSTUM_RIGHT
= (0x04 << CLIPCODE_SHIFT
),
49 FRUSTUM_BOTTOM
= (0x08 << CLIPCODE_SHIFT
),
51 FRUSTUM_NEAR
= (0x10 << CLIPCODE_SHIFT
),
52 FRUSTUM_FAR
= (0x20 << CLIPCODE_SHIFT
),
// Set by ComputeClipCodes for vertices whose w is <= 0.
54 NEGW
= (0x40 << CLIPCODE_SHIFT
),
// Guardband planes: all four share the high bit (0x80 << CLIPCODE_SHIFT) and differ only in the low 4 bits.
56 GUARDBAND_LEFT
= (0x80 << CLIPCODE_SHIFT
| 0x1),
57 GUARDBAND_TOP
= (0x80 << CLIPCODE_SHIFT
| 0x2),
58 GUARDBAND_RIGHT
= (0x80 << CLIPCODE_SHIFT
| 0x4),
59 GUARDBAND_BOTTOM
= (0x80 << CLIPCODE_SHIFT
| 0x8)
// Codes tested against the UNION of a primitive's clip codes (ComputeClipMask) to decide whether it needs clipping.
62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
// Codes tested against the INTERSECTION of a primitive's clip codes (ExecuteStage) to cull primitives outside the view frustum.
63 #define FRUSTUM_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|FRUSTUM_LEFT|FRUSTUM_RIGHT|FRUSTUM_TOP|FRUSTUM_BOTTOM)
// Computes the clip codes for one SIMD vector of vertices (one vertex per lane).
//   state           - API state; reads rastState.depthClipEnable, rastState.clipHalfZ and the gbState guardband arrays
//   vertex          - vertex position; x, y, z are compared against +/-w
//   clipCodes       - [out] per-lane OR of the clip code bits, carried in float lanes
//   viewportIndexes - per-lane index used to gather the guardband factors from gbState
65 template<typename SIMD_T
>
66 void ComputeClipCodes(const API_STATE
&state
, const Vec4
<SIMD_T
> &vertex
, Float
<SIMD_T
> &clipCodes
, Integer
<SIMD_T
> const &viewportIndexes
)
68 clipCodes
= SIMD_T::setzero_ps();
// -w, reused by the left/top/near and guardband tests below
71 Float
<SIMD_T
> vNegW
= SIMD_T::mul_ps(vertex
.w
,SIMD_T::set1_ps(-1.0f
));
// x < -w  ->  FRUSTUM_LEFT
74 Float
<SIMD_T
> vRes
= SIMD_T::cmplt_ps(vertex
.x
, vNegW
);
75 clipCodes
= SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT
)));
// y < -w  ->  FRUSTUM_TOP
78 vRes
= SIMD_T::cmplt_ps(vertex
.y
, vNegW
);
79 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP
))));
// x > w  ->  FRUSTUM_RIGHT
82 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, vertex
.w
);
83 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT
))));
// y > w  ->  FRUSTUM_BOTTOM
86 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, vertex
.w
);
87 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM
))));
// Near/far tests are guarded by depthClipEnable.
// NOTE(review): braces and the else keyword are missing from this listing, so the exact extent
// of this guard and of the clipHalfZ branch below cannot be confirmed from here.
89 if (state
.rastState
.depthClipEnable
)
92 // DX clips depth [0..w], GL clips [-w..w]
93 if (state
.rastState
.clipHalfZ
)
// near plane at z = 0
95 vRes
= SIMD_T::cmplt_ps(vertex
.z
, SIMD_T::setzero_ps());
// near plane at z = -w
99 vRes
= SIMD_T::cmplt_ps(vertex
.z
, vNegW
);
101 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR
))));
// z > w  ->  FRUSTUM_FAR
104 vRes
= SIMD_T::cmpgt_ps(vertex
.z
, vertex
.w
);
105 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR
))));
// w <= 0  ->  NEGW
109 vRes
= SIMD_T::cmple_ps(vertex
.w
, SIMD_T::setzero_ps());
110 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW
))));
// Guardband tests: same shape as the frustum tests, but w is first scaled by the
// per-viewport guardband factor gathered from gbState using viewportIndexes.
// x < -w * gbState.left[vp]  ->  GUARDBAND_LEFT
113 Float
<SIMD_T
> gbMult
= SIMD_T::mul_ps(vNegW
, SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(&state
.gbState
.left
[0], viewportIndexes
));
114 vRes
= SIMD_T::cmplt_ps(vertex
.x
, gbMult
);
115 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT
))));
// y < -w * gbState.top[vp]  ->  GUARDBAND_TOP
118 gbMult
= SIMD_T::mul_ps(vNegW
, SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(&state
.gbState
.top
[0], viewportIndexes
));
119 vRes
= SIMD_T::cmplt_ps(vertex
.y
, gbMult
);
120 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP
))));
// x > w * gbState.right[vp]  ->  GUARDBAND_RIGHT
123 gbMult
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(&state
.gbState
.right
[0], viewportIndexes
));
124 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, gbMult
);
125 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT
))));
// y > w * gbState.bottom[vp]  ->  GUARDBAND_BOTTOM
128 gbMult
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(&state
.gbState
.bottom
[0], viewportIndexes
));
129 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, gbMult
);
130 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM
))));
// BinnerChooser for SIMD256: picks the binning function pointer (pfnBinFunc) that clipped or
// passed-through primitives are forwarded to.
// NOTE(review): the primary template body and the "template <>" line of this specialization
// are not visible in this listing.
133 template<typename SIMD_T
>
139 struct BinnerChooser
<SIMD256
>
// Binning entry point selected by one of the constructors below.
141 PFN_PROCESS_PRIMS pfnBinFunc
;
// Select by vertex count: 3 -> triangle binner (conservative variant when conservativeRast > 0),
// 2 -> BinLines; the remaining path asserts "Unexpected points in clipper.".
143 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
146 if (numVertsPerPrim
== 3)
148 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
151 else if (numVertsPerPrim
== 2)
153 pfnBinFunc
= BinLines
;
157 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Select by topology: BinPoints, BinLines (line topologies), or the triangle binner.
// NOTE(review): the switch statement and several case labels are missing from this listing.
161 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
167 pfnBinFunc
= BinPoints
;
172 case TOP_LINE_LIST_ADJ
:
173 case TOP_LISTSTRIP_ADJ
:
174 pfnBinFunc
= BinLines
;
177 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
// Asserts that a binner was selected, then forwards every argument to it unchanged.
182 void BinFunc(DRAW_CONTEXT
*pDC
, PA_STATE
&pa
, uint32_t workerId
, SIMD256::Vec4 prims
[], uint32_t primMask
, SIMD256::Integer
const &primID
, SIMD256::Integer
&viewportIdx
, SIMD256::Integer
&rtIdx
)
184 SWR_ASSERT(pfnBinFunc
!= nullptr);
186 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
// BinnerChooser for SIMD512: mirrors the SIMD256 specialization but selects the *_simd16
// binning functions. Only compiled when USE_SIMD16_FRONTEND is set.
// NOTE(review): the "template <>" line and the closing #endif are not visible in this listing.
190 #if USE_SIMD16_FRONTEND
192 struct BinnerChooser
<SIMD512
>
// Binning entry point selected by one of the constructors below.
194 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc
;
// Select by vertex count: 3 -> triangle binner (conservative variant when conservativeRast > 0),
// 2 -> BinLines_simd16; the remaining path asserts "Unexpected points in clipper.".
196 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
199 if (numVertsPerPrim
== 3)
201 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
204 else if (numVertsPerPrim
== 2)
206 pfnBinFunc
= BinLines_simd16
;
210 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Select by topology: BinPoints_simd16, BinLines_simd16 (line topologies), or the triangle binner.
// NOTE(review): the switch statement and several case labels are missing from this listing.
214 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
220 pfnBinFunc
= BinPoints_simd16
;
225 case TOP_LINE_LIST_ADJ
:
226 case TOP_LISTSTRIP_ADJ
:
227 pfnBinFunc
= BinLines_simd16
;
230 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
// Asserts that a binner was selected, then forwards every argument to it unchanged.
235 void BinFunc(DRAW_CONTEXT
*pDC
, PA_STATE
&pa
, uint32_t workerId
, SIMD512::Vec4 prims
[], uint32_t primMask
, SIMD512::Integer
const &primID
, SIMD512::Integer
&viewportIdx
, SIMD512::Integer
&rtIdx
)
237 SWR_ASSERT(pfnBinFunc
!= nullptr);
239 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
// SimdHelper for SIMD256: width-specific helpers used by the clipper so the shared code can
// stay generic over SIMD_T.
// NOTE(review): the primary template body and the "template <>" line are not visible in this listing.
244 template<typename SIMD_T
>
250 struct SimdHelper
<SIMD256
>
// Takes a SIMD256 float vector and returns a SIMD256 float vector.
// NOTE(review): the body is missing from this listing; presumably it returns `a` unchanged - confirm.
252 static SIMD256::Float
insert_lo_ps(SIMD256::Float a
)
// Returns a per-lane bitmask (via movemask) of the lanes where a == b.
257 static SIMD256::Mask
cmpeq_ps_mask(SIMD256::Float a
, SIMD256::Float b
)
259 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a
, b
));
// SimdHelper for SIMD512, only compiled when USE_SIMD16_FRONTEND is set.
// NOTE(review): the "template <>" line and the closing #endif are not visible in this listing.
263 #if USE_SIMD16_FRONTEND
265 struct SimdHelper
<SIMD512
>
// Widens a SIMD256 vector to SIMD512: inserts `a` at position 0 of an all-zero SIMD512 vector.
267 static SIMD512::Float
insert_lo_ps(SIMD256::Float a
)
269 return SIMD512::insert_ps
<0>(SIMD512::setzero_ps(), a
);
// Returns the mask of lanes where a == b, using the EQ_OQ compare predicate.
272 static SIMD512::Mask
cmpeq_ps_mask(SIMD512::Float a
, SIMD512::Float b
)
274 return SIMD512::cmp_ps_mask
<SIMD16::CompareType::EQ_OQ
>(a
, b
);
279 // Temp storage used by the clipper
// ClipHelper for SIMD256: maps the SIMD width to its matching temp vertex array.
// NOTE(review): the primary template body and the "template <>" line are not visible in this listing.
280 template<typename SIMD_T
>
286 struct ClipHelper
<SIMD256
>
// Returns a pointer to the first element of tlsTempVertices.
288 static SIMDVERTEX_T
<SIMD256
> *GetTempVertices()
290 return tlsTempVertices
;
// ClipHelper for SIMD512, only compiled when USE_SIMD16_FRONTEND is set.
// NOTE(review): the "template <>" line and the closing #endif are not visible in this listing.
294 #if USE_SIMD16_FRONTEND
296 struct ClipHelper
<SIMD512
>
// Returns a pointer to the first element of tlsTempVertices_simd16.
298 static SIMDVERTEX_T
<SIMD512
> *GetTempVertices()
300 return tlsTempVertices_simd16
;
// Clipper<SIMD_T, NumVertsPerPrim> constructor.
// NOTE(review): the class header line itself is not visible in this listing.
//   in_workerId - stored in workerId and later passed to the binner
//   in_pDC      - draw context; also used to look up the API state via GetApiState()
// NumVertsPerPrim is checked at compile time to be 1, 2 or 3.
305 template<typename SIMD_T
, uint32_t NumVertsPerPrim
>
309 INLINE
Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
310 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
312 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
// Fills the member array clipCodes[] by running the free function ::ComputeClipCodes on each
// of the NumVertsPerPrim vertices of the primitive.
//   vertex          - one SIMD position vector per primitive vertex
//   viewportIndexes - per-lane viewport index, forwarded for the guardband lookup
315 void ComputeClipCodes(Vec4
<SIMD_T
> vertex
[], const Integer
<SIMD_T
> &viewportIndexes
)
317 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
319 ::ComputeClipCodes
<SIMD_T
>(state
, vertex
[i
], clipCodes
[i
], viewportIndexes
);
// Bitwise AND of the clip codes of all vertices of the primitive: a bit survives only if
// every vertex has it set.
// NOTE(review): the return statement is not visible in this listing; presumably returns `result`.
323 Float
<SIMD_T
> ComputeClipCodeIntersection()
325 Float
<SIMD_T
> result
= clipCodes
[0];
327 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
329 result
= SIMD_T::and_ps(result
, clipCodes
[i
]);
// Bitwise OR of the clip codes of all vertices of the primitive: a bit is set if any vertex
// has it set.
// NOTE(review): the return statement is not visible in this listing; presumably returns `result`.
335 Float
<SIMD_T
> ComputeClipCodeUnion()
337 Float
<SIMD_T
> result
= clipCodes
[0];
339 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
341 result
= SIMD_T::or_ps(result
, clipCodes
[i
]);
// Returns a per-lane bitmask of primitives whose clip-code union has any GUARDBAND_CLIP_MASK
// bit set (compared not-equal to zero, then packed with movemask).
347 int ComputeClipMask()
349 Float
<SIMD_T
> clipUnion
= ComputeClipCodeUnion();
351 clipUnion
= SIMD_T::and_ps(clipUnion
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK
)));
353 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion
, SIMD_T::setzero_ps()));
356 // clipper is responsible for culling any prims with NAN coordinates
// Returns a per-lane bitmask of primitives that have a NaN in any position component of any vertex.
//   prim - one SIMD position vector per primitive vertex
357 int ComputeNaNMask(Vec4
<SIMD_T
> prim
[])
359 Float
<SIMD_T
> vNanMask
= SIMD_T::setzero_ps();
361 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
// An unordered compare is true when either operand is NaN, so two compares cover all four components.
363 Float
<SIMD_T
> vNan01
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[0], prim
[e
].v
[1]);
364 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan01
);
366 Float
<SIMD_T
> vNan23
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[2], prim
[e
].v
[3]);
367 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan23
);
370 return SIMD_T::movemask_ps(vNanMask
);
// Returns a per-lane bitmask of primitives to discard because of user cull/clip distances.
//   pa   - primitive assembler; used to assemble the two clip/cull distance slots
//   prim - primitive positions (not read by the code visible here)
// A lane is flagged when, for some enabled cull distance, every vertex of the primitive fails the
// test (distance < 0 or NaN), or when any enabled clip distance of any vertex is NaN.
373 int ComputeUserClipCullMask(PA_STATE
&pa
, Vec4
<SIMD_T
> prim
[])
375 uint8_t cullMask
= state
.backendState
.cullDistanceMask
;
376 uint32_t vertexClipCullOffset
= state
.backendState
.vertexClipCullOffset
;
378 Float
<SIMD_T
> vClipCullMask
= SIMD_T::setzero_ps();
// Distances live in two consecutive attribute slots: Lo at vertexClipCullOffset, Hi at the next slot.
380 Vec4
<SIMD_T
> vClipCullDistLo
[3];
381 Vec4
<SIMD_T
> vClipCullDistHi
[3];
383 pa
.Assemble(vertexClipCullOffset
, vClipCullDistLo
);
384 pa
.Assemble(vertexClipCullOffset
+ 1, vClipCullDistHi
);
// Walk the set bits of cullMask; each bit index encodes (slot = index >> 2, component = index & 3).
// NOTE(review): the declaration of `index` is not visible in this listing.
387 while (_BitScanForward(&index
, cullMask
))
389 cullMask
&= ~(1 << index
);
390 uint32_t slot
= index
>> 2;
391 uint32_t component
= index
& 0x3;
// Starts all-ones and is ANDed across the vertices: it stays set only if every vertex is culled.
393 Float
<SIMD_T
> vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
394 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
396 Float
<SIMD_T
> vCullComp
;
// NOTE(review): the condition choosing between the Lo and Hi vectors is missing from this
// listing; presumably it is keyed on `slot` - confirm.
399 vCullComp
= vClipCullDistLo
[e
][component
];
403 vCullComp
= vClipCullDistHi
[e
][component
];
406 // cull if cull distance < 0 || NAN
407 Float
<SIMD_T
> vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(SIMD_T::setzero_ps(), vCullComp
);
408 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
410 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
413 // clipper should also discard any primitive with NAN clip distance
414 uint8_t clipMask
= state
.backendState
.clipDistanceMask
;
415 while (_BitScanForward(&index
, clipMask
))
417 clipMask
&= ~(1 << index
);
418 uint32_t slot
= index
>> 2;
419 uint32_t component
= index
& 0x3;
421 Float
<SIMD_T
> vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
422 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
424 Float
<SIMD_T
> vClipComp
;
// NOTE(review): as above, the Lo/Hi selection condition is not visible here.
427 vClipComp
= vClipCullDistLo
[e
][component
];
431 vClipComp
= vClipCullDistHi
[e
][component
];
// vClip: this clip distance is NaN (unordered against itself); ORed in per vertex.
// vCull: distance < 0 or NaN; ANDed across vertices like the cull-distance loop above.
434 Float
<SIMD_T
> vClip
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(vClipComp
, vClipComp
);
435 Float
<SIMD_T
> vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(SIMD_T::setzero_ps(), vClipComp
);
436 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
437 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vClip
);
439 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
442 return SIMD_T::movemask_ps(vClipCullMask
);
// Clips a SIMD batch of primitives and bins the result.
//   prim         - positions of the NumVertsPerPrim vertices, one primitive per lane
//   vPrimMask    - lanes holding valid primitives (forwarded to ClipPrims)
//   vClipMask    - lanes that need clipping (forwarded to ClipPrims)
//   pa           - primitive assembler supplying attributes, topology and array-active flags
//   vPrimId, vViewportIdx, vRtIdx - per-lane ids, broadcast per input primitive before binning
// Steps visible below: assemble position/attributes/clip distances into `vertices`, run ClipPrims,
// then for each input primitive gather that lane's output vertices into transposedPrims[0],
// wrap them in a temporary PA_STATE_OPT and hand the emitted primitives to the binner.
// NOTE(review): braces, else branches and preprocessor lines are missing from this listing.
445 void ClipSimd(const Vec4
<SIMD_T
> prim
[], const Float
<SIMD_T
> &vPrimMask
, const Float
<SIMD_T
> &vClipMask
, PA_STATE
&pa
,
446 const Integer
<SIMD_T
> &vPrimId
, const Integer
<SIMD_T
> &vViewportIdx
, const Integer
<SIMD_T
> &vRtIdx
)
448 // input/output vertex store for clipper
449 SIMDVERTEX_T
<SIMD_T
> vertices
[7]; // maximum 7 verts generated per triangle
451 uint32_t constantInterpMask
= state
.backendState
.constantInterpolationMask
;
// Provoking vertex defaults to 0; for triangle fans it is taken from the frontend state.
452 uint32_t provokingVertex
= 0;
453 if (pa
.binTopology
== TOP_TRIANGLE_FAN
)
455 provokingVertex
= state
.frontendState
.provokingVertex
.triFan
;
457 ///@todo: line topology for wireframe?
// Copy the input positions into the position slot of the vertex store.
460 Vec4
<SIMD_T
> tmpVector
[NumVertsPerPrim
];
461 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
463 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = prim
[i
];
467 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
// Assemble every attribute; maxSlot tracks the highest source slot seen so that
// numAttribs (maxSlot + 1) covers all of them.
469 int32_t maxSlot
= -1;
470 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
472 // Compute absolute attrib slot in vertex array
473 uint32_t mapSlot
= backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
474 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
475 uint32_t inputSlot
= backendState
.vertexAttribOffset
+ mapSlot
;
477 pa
.Assemble(inputSlot
, tmpVector
);
479 // if constant interpolation enabled for this attribute, assign the provoking
480 // vertex values to all edges
481 if (CheckBit(constantInterpMask
, slot
))
483 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
485 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
490 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
492 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
497 // assemble user clip distances if enabled
// The low nibble of clipDistanceMask uses the first clip/cull slot, the high nibble the second.
498 uint32_t vertexClipCullSlot
= state
.backendState
.vertexClipCullOffset
;
499 if (state
.backendState
.clipDistanceMask
& 0xf)
501 pa
.Assemble(vertexClipCullSlot
, tmpVector
);
502 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
504 vertices
[i
].attrib
[vertexClipCullSlot
] = tmpVector
[i
];
508 if (state
.backendState
.clipDistanceMask
& 0xf0)
510 pa
.Assemble(vertexClipCullSlot
+ 1, tmpVector
);
511 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
513 vertices
[i
].attrib
[vertexClipCullSlot
+ 1] = tmpVector
[i
];
517 uint32_t numAttribs
= maxSlot
+ 1;
// Run the clipper over the vertex store; the result holds the per-lane emitted vertex count
// (it is read as pVertexCount below).
519 Integer
<SIMD_T
> vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
521 BinnerChooser
<SIMD_T
> binner(NumVertsPerPrim
, pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
523 // set up new PA for binning clipped primitives
524 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
525 if (NumVertsPerPrim
== 3)
527 clipTopology
= TOP_TRIANGLE_FAN
;
529 // so that the binner knows to bloat wide points later
530 if (pa
.binTopology
== TOP_POINT_LIST
)
532 clipTopology
= TOP_POINT_LIST
;
535 else if (NumVertsPerPrim
== 2)
537 clipTopology
= TOP_LINE_LIST
;
541 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Scalar (per-lane) views of the SIMD inputs, indexed by input primitive below.
544 const uint32_t *pVertexCount
= reinterpret_cast<const uint32_t *>(&vNumClippedVerts
);
545 const uint32_t *pPrimitiveId
= reinterpret_cast<const uint32_t *>(&vPrimId
);
546 const uint32_t *pViewportIdx
= reinterpret_cast<const uint32_t *>(&vViewportIdx
);
547 const uint32_t *pRtIdx
= reinterpret_cast<const uint32_t *>(&vRtIdx
);
// Gather offsets: lane k addresses vertex k of the store (stride = one SIMD vertex), lanes 0..6.
549 const SIMD256::Integer vOffsets
= SIMD256::set_epi32(
550 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>), // unused lane
551 6 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
552 5 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
553 4 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
554 3 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
555 2 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
556 1 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
557 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>));
559 // only need to gather 7 verts
560 // @todo dynamic mask based on actual # of verts generated per lane
561 const SIMD256::Float vMask
= SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
563 uint32_t numClippedPrims
= 0;
565 // transpose clipper output so that each lane's vertices are in SIMD order
566 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// NOTE(review): two alternative declarations of transposedPrims follow (heap via AlignedMalloc,
// and a stack array); the preprocessor lines selecting between them are missing from this listing.
570 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
571 SIMDVERTEX_T
<SIMD_T
> *transposedPrims
= reinterpret_cast<SIMDVERTEX_T
<SIMD_T
> *>(AlignedMalloc(sizeof(SIMDVERTEX_T
<SIMD_T
>) * 2, 64));
574 SIMDVERTEX_T
<SIMD_T
> transposedPrims
[2];
// Process each input primitive (lane) separately.
577 uint32_t numInputPrims
= pa
.NumPrims();
578 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
580 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
// NOTE(review): the statement guarded by this test is missing from the listing; presumably
// lanes that produced fewer than NumVertsPerPrim vertices are skipped - confirm.
581 if (numEmittedVerts
< NumVertsPerPrim
)
585 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
587 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
588 SWR_ASSERT(numEmittedPrims
<= 7, "Unexpected primitive count from clipper.");
590 numClippedPrims
+= numEmittedPrims
;
592 // transpose clipper output so that each lane's vertices are in SIMD order
593 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// pBase points at lane `inputPrim` of component 0 of the position slot of vertices[0];
// each gather below then reads that same lane from up to 7 consecutive vertices.
597 uint8_t *pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
600 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
601 static const float *dummy
= reinterpret_cast<const float *>(pBase
);
// Position: one gather per component, advancing pBase by one SIMD float vector each time.
604 for (uint32_t c
= 0; c
< 4; ++c
)
606 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
607 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
608 pBase
+= sizeof(Float
<SIMD_T
>);
// Attributes: same gather pattern, starting at the first attribute slot.
612 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[backendState
.vertexAttribOffset
]) + sizeof(float) * inputPrim
;
614 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
616 uint32_t attribSlot
= backendState
.vertexAttribOffset
+ attrib
;
618 for (uint32_t c
= 0; c
< 4; ++c
)
620 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
621 transposedPrims
[0].attrib
[attribSlot
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
622 pBase
+= sizeof(Float
<SIMD_T
>);
626 // transpose user clip distances if enabled
627 uint32_t vertexClipCullSlot
= backendState
.vertexClipCullOffset
;
628 if (state
.backendState
.clipDistanceMask
& 0x0f)
630 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[vertexClipCullSlot
]) + sizeof(float) * inputPrim
;
632 for (uint32_t c
= 0; c
< 4; ++c
)
634 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
635 transposedPrims
[0].attrib
[vertexClipCullSlot
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
636 pBase
+= sizeof(Float
<SIMD_T
>);
640 if (state
.backendState
.clipDistanceMask
& 0xf0)
642 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[vertexClipCullSlot
+ 1]) + sizeof(float) * inputPrim
;
644 for (uint32_t c
= 0; c
< 4; ++c
)
646 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
647 transposedPrims
[0].attrib
[vertexClipCullSlot
+ 1][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
648 pBase
+= sizeof(Float
<SIMD_T
>);
// Temporary PA over the transposed vertices; inherits the array-active flags from the input PA.
652 PA_STATE_OPT
clipPA(pDC
, numEmittedPrims
, reinterpret_cast<uint8_t *>(&transposedPrims
[0]), numEmittedVerts
, SWR_VTX_NUM_SLOTS
, true, NumVertsPerPrim
, clipTopology
);
653 clipPA
.viewportArrayActive
= pa
.viewportArrayActive
;
654 clipPA
.rtArrayActive
= pa
.rtArrayActive
;
// primMaskMap[n] has the low n bits set: one bit per emitted primitive (n <= 7, asserted above).
656 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
658 const uint32_t primMask
= primMaskMap
[numEmittedPrims
];
// Broadcast this input primitive's ids to every lane of the emitted primitives.
660 const Integer
<SIMD_T
> primID
= SIMD_T::set1_epi32(pPrimitiveId
[inputPrim
]);
661 const Integer
<SIMD_T
> viewportIdx
= SIMD_T::set1_epi32(pViewportIdx
[inputPrim
]);
662 const Integer
<SIMD_T
> rtIdx
= SIMD_T::set1_epi32(pRtIdx
[inputPrim
]);
// Drain the temporary PA: assemble positions and bin them.
// NOTE(review): the do { and the test on `assemble` are missing from this listing.
665 while (clipPA
.GetNextStreamOutput())
669 Vec4
<SIMD_T
> attrib
[NumVertsPerPrim
];
671 bool assemble
= clipPA
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
675 binner
.pfnBinFunc(pDC
, clipPA
, workerId
, attrib
, primMask
, primID
, viewportIdx
, rtIdx
);
678 } while (clipPA
.NextPrim());
// Releases the AlignedMalloc'd transposedPrims; presumably only compiled alongside that allocation.
683 AlignedFree(transposedPrims
);
686 // update global pipeline stat
687 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
// Clipper stage entry point for one SIMD batch of primitives.
//   pa       - primitive assembler for the batch (topology, draw context)
//   prim     - positions of the NumVertsPerPrim vertices, one primitive per lane
//   primMask - bitmask of lanes holding valid primitives
//   primId, viewportIdx, rtIdx - per-lane ids forwarded to the clipper / binner
// Steps visible below: compute clip codes, drop NaN and user-culled primitives, drop primitives whose
// vertices all share a frustum clip code, then either run ClipSimd or forward to the binner.
// NOTE(review): the condition choosing between those last two paths is missing from this listing;
// presumably it is keyed on clipMask - confirm.
690 void ExecuteStage(PA_STATE
&pa
, Vec4
<SIMD_T
> prim
[], uint32_t primMask
,
691 Integer
<SIMD_T
> const &primId
, Integer
<SIMD_T
> const &viewportIdx
, Integer
<SIMD_T
> const &rtIdx
)
693 SWR_ASSERT(pa
.pDC
!= nullptr);
695 BinnerChooser
<SIMD_T
> binner(pa
.binTopology
, pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
697 // update clipper invocations pipeline stat
698 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
699 UPDATE_STAT_FE(CInvocations
, numInvoc
);
701 ComputeClipCodes(prim
, viewportIdx
);
703 // cull prims with NAN coords
704 primMask
&= ~ComputeNaNMask(prim
);
706 // user cull distance cull
707 if (state
.backendState
.cullDistanceMask
| state
.backendState
.clipDistanceMask
)
709 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
712 Float
<SIMD_T
> clipIntersection
= ComputeClipCodeIntersection();
713 // Mask out non-frustum codes
714 clipIntersection
= SIMD_T::and_ps(clipIntersection
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK
)));
716 // cull prims outside view frustum
// A lane stays valid only if no frustum code is common to all of its vertices (intersection == 0).
717 int validMask
= primMask
& SimdHelper
<SIMD_T
>::cmpeq_ps_mask(clipIntersection
, SIMD_T::setzero_ps());
719 // skip clipping for points
720 uint32_t clipMask
= 0;
721 if (NumVertsPerPrim
!= 1)
723 clipMask
= validMask
& ComputeClipMask();
726 AR_EVENT(ClipInfoEvent(numInvoc
, validMask
, clipMask
));
730 RDTSC_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
731 // we have to clip tris, execute the clipper, which will also bin the clipped prims
733 ClipSimd(prim
, SIMD_T::vmask_ps(validMask
), SIMD_T::vmask_ps(clipMask
), pa
, primId
, viewportIdx
, rtIdx
);
734 RDTSC_END(FEGuardbandClip
, 1);
738 // update CPrimitives pipeline state
739 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
741 // forward valid prims directly to binner
742 binner
.pfnBinFunc(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
, viewportIdx
, rtIdx
);
// Returns boundaryCoord0 / (boundaryCoord0 - boundaryCoord1) per lane.
// intersect() passes the plane-relative coordinates of the two edge endpoints and uses the
// result as the interpolation factor t along the edge.
// No guard against boundaryCoord0 == boundaryCoord1 is visible here.
747 Float
<SIMD_T
> ComputeInterpFactor(Float
<SIMD_T
> const &boundaryCoord0
, Float
<SIMD_T
> const &boundaryCoord1
)
749 return SIMD_T::div_ps(boundaryCoord0
, SIMD_T::sub_ps(boundaryCoord0
, boundaryCoord1
));
// Computes, per lane, the byte offset of one float inside an array of SIMD vertices:
//   vIndices[lane] * sizeof(SIMD vertex) + attrib * sizeof(Vec4) + component * sizeof(Float) + elemOffset[lane]
//   attrib    - attribute slot within the vertex
//   vIndices  - per-lane vertex index
//   component - component (0..3) within the attribute
// NOTE(review): the elemOffset initializer and the return statement are missing from this listing;
// presumably elemOffset[lane] is that lane's own byte offset within a SIMD float and vOffsets is returned.
752 Integer
<SIMD_T
> ComputeOffsets(uint32_t attrib
, Integer
<SIMD_T
> const &vIndices
, uint32_t component
)
754 const uint32_t simdVertexStride
= sizeof(SIMDVERTEX_T
<SIMD_T
>);
755 const uint32_t componentStride
= sizeof(Float
<SIMD_T
>);
756 const uint32_t attribStride
= sizeof(Vec4
<SIMD_T
>);
758 static const OSALIGNSIMD16(uint32_t) elemOffset
[16] =
// The table must hold at least one entry per lane of Integer<SIMD_T>.
778 static_assert(sizeof(Integer
<SIMD_T
>) <= sizeof(elemOffset
), "Clipper::ComputeOffsets, Increase number of element offsets.");
780 Integer
<SIMD_T
> vElemOffset
= SIMD_T::loadu_si(reinterpret_cast<const Integer
<SIMD_T
> *>(elemOffset
));
782 // step to the simdvertex
783 Integer
<SIMD_T
> vOffsets
= SIMD_T::mullo_epi32(vIndices
, SIMD_T::set1_epi32(simdVertexStride
));
785 // step to the attribute and component
786 vOffsets
= SIMD_T::add_epi32(vOffsets
, SIMD_T::set1_epi32(attribStride
* attrib
+ componentStride
* component
));
// step to the lane
789 vOffsets
= SIMD_T::add_epi32(vOffsets
, vElemOffset
);
// Gathers one component of one attribute for each active lane, where every lane may address a
// different vertex index.
//   pBuffer   - base of the SIMD vertex array
//   attrib    - attribute slot
//   vMask     - lanes to gather
//   vIndices  - per-lane vertex index
//   component - component (0..3) within the attribute
// The gather source is zero; presumably lanes not set in vMask therefore return 0 - confirm
// against mask_i32gather_ps.
794 Float
<SIMD_T
> GatherComponent(const float* pBuffer
, uint32_t attrib
, Float
<SIMD_T
> const &vMask
, Integer
<SIMD_T
> const &vIndices
, uint32_t component
)
796 Integer
<SIMD_T
> vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
797 Float
<SIMD_T
> vSrc
= SIMD_T::setzero_ps();
799 return SIMD_T::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(vSrc
, pBuffer
, vOffsets
, vMask
);
// Writes one component of one attribute for each active lane, where every lane may address a
// different vertex index. Done as a scalar loop over the set bits of the lane mask.
//   pBuffer   - base of the SIMD vertex array that is written to
//   attrib    - attribute slot
//   vMask     - lanes to write
//   vIndices  - per-lane destination vertex index
//   component - component (0..3) within the attribute
//   vSrc      - per-lane values to store
// NOTE(review): pBuffer is declared const float* but is written through via a cast below.
// NOTE(review): the declaration of `lane` is not visible in this listing.
802 void ScatterComponent(const float* pBuffer
, uint32_t attrib
, Float
<SIMD_T
> const &vMask
, Integer
<SIMD_T
> const &vIndices
, uint32_t component
, Float
<SIMD_T
> const &vSrc
)
804 Integer
<SIMD_T
> vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
// Scalar views of the offsets and source values, indexed by lane.
806 const uint32_t *pOffsets
= reinterpret_cast<const uint32_t *>(&vOffsets
);
807 const float *pSrc
= reinterpret_cast<const float *>(&vSrc
);
808 uint32_t mask
= SIMD_T::movemask_ps(vMask
);
810 while (_BitScanForward(&lane
, mask
))
812 mask
&= ~(1 << lane
);
// Offsets are in bytes, so they are applied to a byte pointer.
813 const uint8_t *pBuf
= reinterpret_cast<const uint8_t *>(pBuffer
) + pOffsets
[lane
];
814 *(float *)pBuf
= pSrc
[lane
];
// Computes the intersection of edge (v1, v2) with the clip plane ClippingPlane for the active
// lanes, then writes the interpolated position, attributes and enabled clip distances to
// pOutVerts at vertex index outIndex.
// NOTE(review): the line carrying the function name and return type is missing from this listing;
// the name `intersect` is taken from the call site in ClipTriToPlane.
818 template<SWR_CLIPCODES ClippingPlane
>
820 const Float
<SIMD_T
> &vActiveMask
, // active lanes to operate on
821 const Integer
<SIMD_T
> &s
, // index to first edge vertex v0 in pInPts.
822 const Integer
<SIMD_T
> &p
, // index to second edge vertex v1 in pInPts.
823 const Vec4
<SIMD_T
> &v1
, // vertex 0 position
824 const Vec4
<SIMD_T
> &v2
, // vertex 1 position
825 Integer
<SIMD_T
> &outIndex
, // output index.
826 const float *pInVerts
, // array of all the input positions.
827 uint32_t numInAttribs
, // number of attributes per vertex.
828 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
830 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
831 uint32_t vertexClipCullOffset
= this->state
.backendState
.vertexClipCullOffset
;
833 // compute interpolation factor
// For each plane the factor is built from the plane-relative coordinate of both endpoints:
// (w + coord) for the left/top/GL-near planes, (w - coord) for the right/bottom/far planes.
// NOTE(review): the declaration of `t` and the FRUSTUM_NEAR case label are missing from this listing.
835 switch (ClippingPlane
)
837 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[0]), SIMD_T::add_ps(v2
[3], v2
[0])); break;
838 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[0]), SIMD_T::sub_ps(v2
[3], v2
[0])); break;
839 case FRUSTUM_TOP
: t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[1]), SIMD_T::add_ps(v2
[3], v2
[1])); break;
840 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[1]), SIMD_T::sub_ps(v2
[3], v2
[1])); break;
842 // DX Znear plane is 0, GL is -w
843 if (this->state
.rastState
.clipHalfZ
)
845 t
= ComputeInterpFactor(v1
[2], v2
[2]);
849 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[2]), SIMD_T::add_ps(v2
[3], v2
[2]));
852 case FRUSTUM_FAR
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[2]), SIMD_T::sub_ps(v2
[3], v2
[2])); break;
853 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
856 // interpolate position and store
// out = (v2 - v1) * t + v1, per component
857 for (uint32_t c
= 0; c
< 4; ++c
)
859 Float
<SIMD_T
> vOutPos
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
860 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
863 // interpolate attributes and store
// Attributes are gathered from the two edge vertices (indices s and p) and blended with the same t.
864 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
866 uint32_t attribSlot
= vertexAttribOffset
+ a
;
867 for (uint32_t c
= 0; c
< 4; ++c
)
869 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
870 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
871 Float
<SIMD_T
> vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
872 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
876 // interpolate clip distance if enabled
// The low nibble of clipDistanceMask uses the first clip/cull slot, the high nibble the second.
877 if (this->state
.backendState
.clipDistanceMask
& 0xf)
879 uint32_t attribSlot
= vertexClipCullOffset
;
880 for (uint32_t c
= 0; c
< 4; ++c
)
882 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
883 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
884 Float
<SIMD_T
> vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
885 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
889 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
891 uint32_t attribSlot
= vertexClipCullOffset
+ 1;
892 for (uint32_t c
= 0; c
< 4; ++c
)
894 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
895 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
896 Float
<SIMD_T
> vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
897 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
// Returns a per-lane mask that is set where vertex v lies on the inside of ClippingPlane
// (boundary included: the compares are >= / <=).
//   v - vertex position; v[0..2] are x, y, z and v[3] is w
// The near plane is z >= 0 when clipHalfZ is set, z >= -w otherwise.
// Any other plane value reports SWR_INVALID and returns an all-zero mask.
902 template<SWR_CLIPCODES ClippingPlane
>
903 Float
<SIMD_T
> inside(const Vec4
<SIMD_T
> &v
)
905 switch (ClippingPlane
)
907 case FRUSTUM_LEFT
: return SIMD_T::cmpge_ps(v
[0], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
908 case FRUSTUM_RIGHT
: return SIMD_T::cmple_ps(v
[0], v
[3]);
909 case FRUSTUM_TOP
: return SIMD_T::cmpge_ps(v
[1], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
910 case FRUSTUM_BOTTOM
: return SIMD_T::cmple_ps(v
[1], v
[3]);
911 case FRUSTUM_NEAR
: return SIMD_T::cmpge_ps(v
[2], this->state
.rastState
.clipHalfZ
? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
912 case FRUSTUM_FAR
: return SIMD_T::cmple_ps(v
[2], v
[3]);
914 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
915 return SIMD_T::setzero_ps();
// Clips the polygon held in each lane against one plane.
//   pInVerts     - input SIMD vertex array
//   vNumInPts    - per-lane number of input vertices
//   numInAttribs - number of attributes per vertex to carry along
//   pOutVerts    - output SIMD vertex array
// For every edge (s -> p) of a lane's polygon: s is copied to the output if it is inside the plane,
// and an intersection vertex is appended when s and p lie on different sides.
// NOTE(review): the return statement is not visible in this listing; presumably the per-lane
// output vertex count (vOutIndex) is returned.
919 template<SWR_CLIPCODES ClippingPlane
>
920 Integer
<SIMD_T
> ClipTriToPlane(const float *pInVerts
, const Integer
<SIMD_T
> &vNumInPts
, uint32_t numInAttribs
, float *pOutVerts
)
922 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
// vCurIndex: current input vertex per lane; vOutIndex: next free output vertex per lane;
// vActiveMask: lanes that still have input vertices left.
924 Integer
<SIMD_T
> vCurIndex
= SIMD_T::setzero_si();
925 Integer
<SIMD_T
> vOutIndex
= SIMD_T::setzero_si();
926 Float
<SIMD_T
> vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
928 while (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
// Edge endpoints: s is the current vertex, p the next one. Where s + 1 is not below vNumInPts,
// p is replaced by 0 so that the last edge closes the polygon (assuming blendv selects its
// second operand where the mask is set).
930 Integer
<SIMD_T
> s
= vCurIndex
;
931 Integer
<SIMD_T
> p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
932 Integer
<SIMD_T
> underFlowMask
= SIMD_T::cmpgt_epi32(vNumInPts
, p
);
933 p
= SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p
), SIMD_T::castsi_ps(underFlowMask
)));
// gather both endpoint positions
936 Vec4
<SIMD_T
> vInPos0
, vInPos1
;
937 for (uint32_t c
= 0; c
< 4; ++c
)
939 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
940 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
943 // compute inside mask
944 Float
<SIMD_T
> s_in
= inside
<ClippingPlane
>(vInPos0
);
945 Float
<SIMD_T
> p_in
= inside
<ClippingPlane
>(vInPos1
);
947 // compute intersection mask (s_in != p_in)
948 Float
<SIMD_T
> intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
949 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
// Copy s to the output for the active lanes where it is inside the plane.
952 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
953 if (!SIMD_T::testz_ps(s_in
, s_in
))
// position
956 for (uint32_t c
= 0; c
< 4; ++c
)
958 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
// attributes
962 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
964 uint32_t attribSlot
= vertexAttribOffset
+ a
;
965 for (uint32_t c
= 0; c
< 4; ++c
)
967 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
968 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
972 // store clip distance if enabled
973 uint32_t vertexClipCullSlot
= this->state
.backendState
.vertexClipCullOffset
;
974 if (this->state
.backendState
.clipDistanceMask
& 0xf)
976 uint32_t attribSlot
= vertexClipCullSlot
;
977 for (uint32_t c
= 0; c
< 4; ++c
)
979 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
980 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
984 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
986 uint32_t attribSlot
= vertexClipCullSlot
+ 1;
987 for (uint32_t c
= 0; c
< 4; ++c
)
989 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
990 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
994 // increment outIndex
995 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
998 // compute and store intersection
999 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1001 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1003 // increment outIndex for active lanes
1004 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1007 // increment loop index and update active mask
1008 vCurIndex
= SIMD_T::add_epi32(vCurIndex
, SIMD_T::set1_epi32(1));
1009 vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1015 template<SWR_CLIPCODES ClippingPlane
>
1016 Integer
<SIMD_T
> ClipLineToPlane(const float *pInVerts
, const Integer
<SIMD_T
> &vNumInPts
, uint32_t numInAttribs
, float *pOutVerts
)
1018 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1020 Integer
<SIMD_T
> vCurIndex
= SIMD_T::setzero_si();
1021 Integer
<SIMD_T
> vOutIndex
= SIMD_T::setzero_si();
1022 Float
<SIMD_T
> vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1024 if (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
))
1026 Integer
<SIMD_T
> s
= vCurIndex
;
1027 Integer
<SIMD_T
> p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
1030 Vec4
<SIMD_T
> vInPos0
, vInPos1
;
1031 for (uint32_t c
= 0; c
< 4; ++c
)
1033 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1034 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1037 // compute inside mask
1038 Float
<SIMD_T
> s_in
= inside
<ClippingPlane
>(vInPos0
);
1039 Float
<SIMD_T
> p_in
= inside
<ClippingPlane
>(vInPos1
);
1041 // compute intersection mask (s_in != p_in)
1042 Float
<SIMD_T
> intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1043 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1045 // store s if inside
1046 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1047 if (!SIMD_T::testz_ps(s_in
, s_in
))
1049 for (uint32_t c
= 0; c
< 4; ++c
)
1051 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
1054 // interpolate attributes and store
1055 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1057 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1058 for (uint32_t c
= 0; c
< 4; ++c
)
1060 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1061 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1065 // increment outIndex
1066 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1069 // compute and store intersection
1070 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1072 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1074 // increment outIndex for active lanes
1075 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1078 // store p if inside
1079 p_in
= SIMD_T::and_ps(p_in
, vActiveMask
);
1080 if (!SIMD_T::testz_ps(p_in
, p_in
))
1082 for (uint32_t c
= 0; c
< 4; ++c
)
1084 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
1087 // interpolate attributes and store
1088 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1090 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1091 for (uint32_t c
= 0; c
< 4; ++c
)
1093 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
1094 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
1098 // increment outIndex
1099 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), p_in
);
1106 Integer
<SIMD_T
> ClipPrims(float *pVertices
, const Float
<SIMD_T
> &vPrimMask
, const Float
<SIMD_T
> &vClipMask
, int numAttribs
)
1109 float *pTempVerts
= reinterpret_cast<float *>(ClipHelper
<SIMD_T
>::GetTempVertices());
1111 // zero out num input verts for non-active lanes
1112 Integer
<SIMD_T
> vNumInPts
= SIMD_T::set1_epi32(NumVertsPerPrim
);
1113 vNumInPts
= SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts
, vClipMask
);
1115 // clip prims to frustum
1116 Integer
<SIMD_T
> vNumOutPts
;
1117 if (NumVertsPerPrim
== 3)
1119 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1120 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1121 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1122 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1123 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1124 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1128 SWR_ASSERT(NumVertsPerPrim
== 2);
1129 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1130 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1131 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1132 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1133 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1134 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1137 // restore num verts for non-clipped, active lanes
1138 Float
<SIMD_T
> vNonClippedMask
= SIMD_T::andnot_ps(vClipMask
, vPrimMask
);
1139 vNumOutPts
= SIMD_T::blendv_epi32(vNumOutPts
, SIMD_T::set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
1144 const uint32_t workerId
{ 0 };
1145 DRAW_CONTEXT
*pDC
{ nullptr };
1146 const API_STATE
&state
;
1147 Float
<SIMD_T
> clipCodes
[NumVertsPerPrim
];
1151 // pipeline stage functions
1152 void ClipTriangles(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
1153 void ClipLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
1154 void ClipPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
1155 #if USE_SIMD16_FRONTEND
1156 void SIMDCALL
ClipTriangles_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);
1157 void SIMDCALL
ClipLines_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);
1158 void SIMDCALL
ClipPoints_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);