1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for clipping
27 ******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
// Scratch vertex arrays (7 entries each) declared with the THREAD qualifier;
// presumably thread-local, per the "tls" prefix -- confirm against the THREAD macro.
35 // Temp storage used by the clipper
36 extern THREAD SIMDVERTEX_T
<SIMD256
> tlsTempVertices
[7];
// SIMD512 counterpart, only declared when USE_SIMD16_FRONTEND is enabled.
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T
<SIMD512
> tlsTempVertices_simd16
[7];
// Clip-code bit values. Each frustum plane and NEGW gets its own bit above
// CLIPCODE_SHIFT; the four guardband codes share one high bit
// (0x80 << CLIPCODE_SHIFT) and are distinguished by the low four bits.
43 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
44 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
45 // rather than intersection, of clipcodes.
46 #define CLIPCODE_SHIFT 23
// Frustum X/Y planes.
47 FRUSTUM_LEFT
= (0x01 << CLIPCODE_SHIFT
),
48 FRUSTUM_TOP
= (0x02 << CLIPCODE_SHIFT
),
49 FRUSTUM_RIGHT
= (0x04 << CLIPCODE_SHIFT
),
50 FRUSTUM_BOTTOM
= (0x08 << CLIPCODE_SHIFT
),
// Frustum depth planes.
52 FRUSTUM_NEAR
= (0x10 << CLIPCODE_SHIFT
),
53 FRUSTUM_FAR
= (0x20 << CLIPCODE_SHIFT
),
// Set by ComputeClipCodes when w <= 0.
55 NEGW
= (0x40 << CLIPCODE_SHIFT
),
// Guardband planes: shared high bit plus one distinguishing low bit each.
57 GUARDBAND_LEFT
= (0x80 << CLIPCODE_SHIFT
| 0x1),
58 GUARDBAND_TOP
= (0x80 << CLIPCODE_SHIFT
| 0x2),
59 GUARDBAND_RIGHT
= (0x80 << CLIPCODE_SHIFT
| 0x4),
60 GUARDBAND_BOTTOM
= (0x80 << CLIPCODE_SHIFT
| 0x8)
// Codes that require clipping: near/far, the four guardband planes, and negative w.
63 #define GUARDBAND_CLIP_MASK \
64 (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
65 GUARDBAND_BOTTOM | NEGW)
// The six view-frustum plane codes; used to mask the clip-code intersection for culling.
66 #define FRUSTUM_CLIP_MASK \
67 (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
// Computes per-lane clip codes for one SIMD vertex.
//   state           - reads rastState.depthClipEnable, rastState.clipHalfZ and the
//                     gbState.left/top/right/bottom guardband arrays.
//   vertex          - clip-space position (x, y, z, w) per lane.
//   clipCodes       - output; reset to zero, then each failed plane test ORs in its code.
//   viewportIndexes - per-lane gather indices into the guardband arrays.
// Each test produces an all-ones/all-zeros lane mask that is ANDed with the plane's
// code constant and accumulated into clipCodes.
69 template <typename SIMD_T
>
70 void ComputeClipCodes(const API_STATE
& state
,
71 const Vec4
<SIMD_T
>& vertex
,
72 Float
<SIMD_T
>& clipCodes
,
73 Integer
<SIMD_T
> const& viewportIndexes
)
75 clipCodes
= SIMD_T::setzero_ps();
// -w, reused by all "less than -w" tests below
78 Float
<SIMD_T
> vNegW
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::set1_ps(-1.0f
));
// FRUSTUM_LEFT: x < -w
81 Float
<SIMD_T
> vRes
= SIMD_T::cmplt_ps(vertex
.x
, vNegW
);
82 clipCodes
= SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT
)));
// FRUSTUM_TOP: y < -w
85 vRes
= SIMD_T::cmplt_ps(vertex
.y
, vNegW
);
86 clipCodes
= SIMD_T::or_ps(
87 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP
))));
// FRUSTUM_RIGHT: x > w
90 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, vertex
.w
);
91 clipCodes
= SIMD_T::or_ps(
92 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT
))));
// FRUSTUM_BOTTOM: y > w
95 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, vertex
.w
);
96 clipCodes
= SIMD_T::or_ps(
97 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM
))));
// Depth plane tests are gated on depthClipEnable.
99 if (state
.rastState
.depthClipEnable
)
// FRUSTUM_NEAR: z < 0 when clipHalfZ is set, otherwise z < -w
102 // DX clips depth [0..w], GL clips [-w..w]
103 if (state
.rastState
.clipHalfZ
)
105 vRes
= SIMD_T::cmplt_ps(vertex
.z
, SIMD_T::setzero_ps());
109 vRes
= SIMD_T::cmplt_ps(vertex
.z
, vNegW
);
111 clipCodes
= SIMD_T::or_ps(
112 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR
))));
// FRUSTUM_FAR: z > w
115 vRes
= SIMD_T::cmpgt_ps(vertex
.z
, vertex
.w
);
116 clipCodes
= SIMD_T::or_ps(
117 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR
))));
// NEGW: w <= 0
121 vRes
= SIMD_T::cmple_ps(vertex
.w
, SIMD_T::setzero_ps());
123 SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW
))));
// GUARDBAND_LEFT: x < -w * gbState.left[viewportIndex]
// (the gather scale of 4 is presumably sizeof(float) -- confirm against ScaleFactor)
126 Float
<SIMD_T
> gbMult
= SIMD_T::mul_ps(vNegW
,
127 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
128 &state
.gbState
.left
[0], viewportIndexes
));
129 vRes
= SIMD_T::cmplt_ps(vertex
.x
, gbMult
);
130 clipCodes
= SIMD_T::or_ps(
131 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT
))));
// GUARDBAND_TOP: y < -w * gbState.top[viewportIndex]
134 gbMult
= SIMD_T::mul_ps(vNegW
,
135 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
136 &state
.gbState
.top
[0], viewportIndexes
));
137 vRes
= SIMD_T::cmplt_ps(vertex
.y
, gbMult
);
138 clipCodes
= SIMD_T::or_ps(
139 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP
))));
// GUARDBAND_RIGHT: x > w * gbState.right[viewportIndex]
142 gbMult
= SIMD_T::mul_ps(vertex
.w
,
143 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
144 &state
.gbState
.right
[0], viewportIndexes
));
145 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, gbMult
);
146 clipCodes
= SIMD_T::or_ps(
147 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT
))));
// GUARDBAND_BOTTOM: y > w * gbState.bottom[viewportIndex]
150 gbMult
= SIMD_T::mul_ps(vertex
.w
,
151 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
152 &state
.gbState
.bottom
[0], viewportIndexes
));
153 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, gbMult
);
154 clipCodes
= SIMD_T::or_ps(
155 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM
))));
// Selects and invokes the binning function for SIMD256-wide primitives.
// The chosen function pointer is stored in pfnBinFunc by one of two constructors.
158 template <typename SIMD_T
>
164 struct BinnerChooser
<SIMD256
>
166 PFN_PROCESS_PRIMS pfnBinFunc
;
// Choose by vertex count: 3 -> triangle binner (conservative variant when
// conservativeRast > 0), 2 -> BinLines; any other count trips the assert below.
168 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
172 if (numVertsPerPrim
== 3)
174 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
177 else if (numVertsPerPrim
== 2)
179 pfnBinFunc
= BinLines
;
183 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Choose by topology: BinPoints, BinLines (line topologies, including the *_ADJ
// variants listed below) or the triangle binner.
187 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
194 pfnBinFunc
= BinPoints
;
199 case TOP_LINE_LIST_ADJ
:
200 case TOP_LISTSTRIP_ADJ
:
201 pfnBinFunc
= BinLines
;
204 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
// Forwards all arguments to the selected binning function; asserts it was set.
209 void BinFunc(DRAW_CONTEXT
* pDC
,
212 SIMD256::Vec4 prims
[],
214 SIMD256::Integer
const& primID
,
215 SIMD256::Integer
& viewportIdx
,
216 SIMD256::Integer
& rtIdx
)
218 SWR_ASSERT(pfnBinFunc
!= nullptr);
220 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
224 #if USE_SIMD16_FRONTEND
// SIMD512 counterpart of the SIMD256 binner chooser; selects among the *_simd16
// binning functions and stores the choice in pfnBinFunc.
226 struct BinnerChooser
<SIMD512
>
228 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc
;
// Choose by vertex count: 3 -> triangle binner (conservative variant when
// conservativeRast > 0), 2 -> BinLines_simd16; any other count trips the assert below.
230 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
234 if (numVertsPerPrim
== 3)
236 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
239 else if (numVertsPerPrim
== 2)
241 pfnBinFunc
= BinLines_simd16
;
245 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Choose by topology: BinPoints_simd16, BinLines_simd16 (line topologies, including
// the *_ADJ variants listed below) or the triangle binner.
249 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
256 pfnBinFunc
= BinPoints_simd16
;
261 case TOP_LINE_LIST_ADJ
:
262 case TOP_LISTSTRIP_ADJ
:
263 pfnBinFunc
= BinLines_simd16
;
266 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
// Forwards all arguments to the selected binning function; asserts it was set.
271 void BinFunc(DRAW_CONTEXT
* pDC
,
274 SIMD512::Vec4 prims
[],
276 SIMD512::Integer
const& primID
,
277 SIMD512::Integer
& viewportIdx
,
278 SIMD512::Integer
& rtIdx
)
280 SWR_ASSERT(pfnBinFunc
!= nullptr);
282 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
// Width-specific helpers used by the clipper, SIMD256 flavour.
287 template <typename SIMD_T
>
293 struct SimdHelper
<SIMD256
>
// A SIMD256 value already has the target width, so it is returned unchanged.
295 static SIMD256::Float
insert_lo_ps(SIMD256::Float a
) { return a
; }
// Per-lane equality compare collapsed to a bit mask via movemask.
297 static SIMD256::Mask
cmpeq_ps_mask(SIMD256::Float a
, SIMD256::Float b
)
299 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a
, b
));
303 #if USE_SIMD16_FRONTEND
// Width-specific helpers used by the clipper, SIMD512 flavour.
305 struct SimdHelper
<SIMD512
>
// Widens a SIMD256 value: inserts it at position 0 of a zeroed SIMD512 register.
307 static SIMD512::Float
insert_lo_ps(SIMD256::Float a
)
309 return SIMD512::insert_ps
<0>(SIMD512::setzero_ps(), a
);
// Per-lane equality compare (EQ_OQ predicate) returned directly as a mask.
312 static SIMD512::Mask
cmpeq_ps_mask(SIMD512::Float a
, SIMD512::Float b
)
314 return SIMD512::cmp_ps_mask
<SIMD16::CompareType::EQ_OQ
>(a
, b
);
// Accessor for the SIMD256 temporary vertex array (tlsTempVertices).
319 // Temp storage used by the clipper
320 template <typename SIMD_T
>
326 struct ClipHelper
<SIMD256
>
// Returns a pointer to the first element of tlsTempVertices.
328 static SIMDVERTEX_T
<SIMD256
>* GetTempVertices() { return tlsTempVertices
; }
331 #if USE_SIMD16_FRONTEND
// Accessor for the SIMD512 temporary vertex array (tlsTempVertices_simd16).
333 struct ClipHelper
<SIMD512
>
// Returns a pointer to the first element of tlsTempVertices_simd16.
335 static SIMDVERTEX_T
<SIMD512
>* GetTempVertices() { return tlsTempVertices_simd16
; }
339 template <typename SIMD_T
, uint32_t NumVertsPerPrim
>
// Constructs a clipper bound to a worker and draw context.
//   in_workerId - stored in workerId.
//   in_pDC      - stored in pDC; also used to look up the API state via GetApiState.
// NumVertsPerPrim is checked at compile time to be 1, 2 or 3.
343 INLINE
Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
344 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
346 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
// Computes clip codes for each of the primitive's NumVertsPerPrim vertices by
// calling the free function ::ComputeClipCodes; results land in clipCodes[i].
//   vertex          - array of vertex positions (at least NumVertsPerPrim entries are read).
//   viewportIndexes - per-lane viewport indices passed through unchanged.
349 void ComputeClipCodes(Vec4
<SIMD_T
> vertex
[], const Integer
<SIMD_T
>& viewportIndexes
)
351 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
353 ::ComputeClipCodes
<SIMD_T
>(state
, vertex
[i
], clipCodes
[i
], viewportIndexes
);
// Bitwise AND of the clip codes of all vertices of the primitive: a bit survives
// only if every vertex has it set.
357 Float
<SIMD_T
> ComputeClipCodeIntersection()
359 Float
<SIMD_T
> result
= clipCodes
[0];
361 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
363 result
= SIMD_T::and_ps(result
, clipCodes
[i
]);
// Bitwise OR of the clip codes of all vertices of the primitive: a bit is set
// if any vertex has it set.
369 Float
<SIMD_T
> ComputeClipCodeUnion()
371 Float
<SIMD_T
> result
= clipCodes
[0];
373 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
375 result
= SIMD_T::or_ps(result
, clipCodes
[i
]);
// Returns a per-lane bit mask of primitives whose clip-code union is non-zero
// after masking with GUARDBAND_CLIP_MASK.
381 int ComputeClipMask()
383 Float
<SIMD_T
> clipUnion
= ComputeClipCodeUnion();
// keep only the codes that require clipping
386 SIMD_T::and_ps(clipUnion
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK
)));
// one bit per lane: set when any clip-relevant code remains
388 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion
, SIMD_T::setzero_ps()));
// Returns a per-lane bit mask of primitives that have a NaN in any position
// component of any vertex.
//   prim - vertex positions; NumVertsPerPrim entries are inspected.
391 // clipper is responsible for culling any prims with NAN coordinates
392 int ComputeNaNMask(Vec4
<SIMD_T
> prim
[])
394 Float
<SIMD_T
> vNanMask
= SIMD_T::setzero_ps();
396 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
// an unordered compare is true when either operand is NaN; checks components 0 and 1
398 Float
<SIMD_T
> vNan01
=
399 SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[0], prim
[e
].v
[1]);
400 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan01
);
// same test for components 2 and 3
402 Float
<SIMD_T
> vNan23
=
403 SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[2], prim
[e
].v
[3]);
404 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan23
);
407 return SIMD_T::movemask_ps(vNanMask
);
// Returns a per-lane bit mask of primitives to discard because of user
// cull/clip distances.
//   pa   - primitive assembler; used to assemble the two clip/cull distance slots.
//   prim - vertex positions (not read in the visible code).
// A primitive is flagged when, for some enabled distance, every vertex has a
// negative-or-NaN value, or when any enabled clip distance is NaN.
410 int ComputeUserClipCullMask(PA_STATE
& pa
, Vec4
<SIMD_T
> prim
[])
412 uint8_t cullMask
= state
.backendState
.cullDistanceMask
;
413 uint32_t vertexClipCullOffset
= state
.backendState
.vertexClipCullOffset
;
415 Float
<SIMD_T
> vClipCullMask
= SIMD_T::setzero_ps();
// Lo / Hi hold the two consecutive attribute slots starting at vertexClipCullOffset.
417 Vec4
<SIMD_T
> vClipCullDistLo
[3];
418 Vec4
<SIMD_T
> vClipCullDistHi
[3];
420 pa
.Assemble(vertexClipCullOffset
, vClipCullDistLo
);
421 pa
.Assemble(vertexClipCullOffset
+ 1, vClipCullDistHi
);
// Visit each enabled cull distance (one bit per distance in cullMask).
424 while (_BitScanForward(&index
, cullMask
))
426 cullMask
&= ~(1 << index
);
// bit index -> (slot, component): four components per slot
427 uint32_t slot
= index
>> 2;
428 uint32_t component
= index
& 0x3;
// starts all-ones (-1.0f has the sign bit set in every lane); ANDed down per vertex
430 Float
<SIMD_T
> vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
431 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
433 Float
<SIMD_T
> vCullComp
;
// source is the Lo or Hi slot (presumably selected by `slot`)
436 vCullComp
= vClipCullDistLo
[e
][component
];
440 vCullComp
= vClipCullDistHi
[e
][component
];
443 // cull if cull distance < 0 || NAN
444 Float
<SIMD_T
> vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(
445 SIMD_T::setzero_ps(), vCullComp
);
446 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
// flag the primitive only when all of its vertices failed this distance
448 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
451 // clipper should also discard any primitive with NAN clip distance
452 uint8_t clipMask
= state
.backendState
.clipDistanceMask
;
453 while (_BitScanForward(&index
, clipMask
))
455 clipMask
&= ~(1 << index
);
456 uint32_t slot
= index
>> 2;
457 uint32_t component
= index
& 0x3;
459 Float
<SIMD_T
> vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
460 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
462 Float
<SIMD_T
> vClipComp
;
465 vClipComp
= vClipCullDistLo
[e
][component
];
469 vClipComp
= vClipCullDistHi
[e
][component
];
// vClip: lanes where this clip distance is NaN (unordered with itself)
472 Float
<SIMD_T
> vClip
=
473 SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(vClipComp
, vClipComp
);
// vCull: lanes where the clip distance is negative or NaN
474 Float
<SIMD_T
> vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(
475 SIMD_T::setzero_ps(), vClipComp
);
476 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
// a NaN on any single vertex is enough to discard the primitive
477 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vClip
);
479 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
482 return SIMD_T::movemask_ps(vClipCullMask
);
// Clips a SIMD batch of primitives and sends the clipped output on for binning.
//   prim         - input vertex positions (NumVertsPerPrim entries).
//   vPrimMask    - lanes holding valid primitives (passed to ClipPrims).
//   vClipMask    - lanes that need clipping (passed to ClipPrims).
//   vPrimId / vViewportIdx / vRtIdx - per-lane ids, broadcast per input primitive below.
// Steps visible here: stage positions, attributes and user clip distances into a
// local 7-entry vertex array; call ClipPrims; then, per input primitive, gather
// ("transpose") that lane's clipped vertices into transposedPrims and walk them
// with a PA_STATE_OPT. Finally CPrimitives is updated with the clipped prim count.
485 void ClipSimd(const Vec4
<SIMD_T
> prim
[],
486 const Float
<SIMD_T
>& vPrimMask
,
487 const Float
<SIMD_T
>& vClipMask
,
489 const Integer
<SIMD_T
>& vPrimId
,
490 const Integer
<SIMD_T
>& vViewportIdx
,
491 const Integer
<SIMD_T
>& vRtIdx
)
493 // input/output vertex store for clipper
494 SIMDVERTEX_T
<SIMD_T
> vertices
[7]; // maximum 7 verts generated per triangle
// Provoking vertex defaults to 0; triangle fans take it from the frontend state.
496 uint32_t constantInterpMask
= state
.backendState
.constantInterpolationMask
;
497 uint32_t provokingVertex
= 0;
498 if (pa
.binTopology
== TOP_TRIANGLE_FAN
)
500 provokingVertex
= state
.frontendState
.provokingVertex
.triFan
;
502 ///@todo: line topology for wireframe?
// Stage the input positions.
505 Vec4
<SIMD_T
> tmpVector
[NumVertsPerPrim
];
506 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
508 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = prim
[i
];
// Stage the attributes; maxSlot tracks the highest mapped slot so numAttribs can
// be derived from it afterwards.
512 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
514 int32_t maxSlot
= -1;
515 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
517 // Compute absolute attrib slot in vertex array
519 backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
520 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
521 uint32_t inputSlot
= backendState
.vertexAttribOffset
+ mapSlot
;
523 pa
.Assemble(inputSlot
, tmpVector
);
525 // if constant interpolation enabled for this attribute, assign the provoking
526 // vertex values to all edges
527 if (CheckBit(constantInterpMask
, slot
))
529 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
531 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
// otherwise each vertex keeps its own attribute value
536 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
538 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
543 // assemble user clip distances if enabled
544 uint32_t vertexClipCullSlot
= state
.backendState
.vertexClipCullOffset
;
// low four mask bits -> first clip/cull slot
545 if (state
.backendState
.clipDistanceMask
& 0xf)
547 pa
.Assemble(vertexClipCullSlot
, tmpVector
);
548 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
550 vertices
[i
].attrib
[vertexClipCullSlot
] = tmpVector
[i
];
// high four mask bits -> second clip/cull slot
554 if (state
.backendState
.clipDistanceMask
& 0xf0)
556 pa
.Assemble(vertexClipCullSlot
+ 1, tmpVector
);
557 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
559 vertices
[i
].attrib
[vertexClipCullSlot
+ 1] = tmpVector
[i
];
563 uint32_t numAttribs
= maxSlot
+ 1;
// Run the clipper over the staged vertices; the result is a per-lane vertex count.
565 Integer
<SIMD_T
> vNumClippedVerts
=
566 ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
568 BinnerChooser
<SIMD_T
> binner(NumVertsPerPrim
,
569 pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
571 // set up new PA for binning clipped primitives
572 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
573 if (NumVertsPerPrim
== 3)
575 clipTopology
= TOP_TRIANGLE_FAN
;
577 // so that the binner knows to bloat wide points later
578 if (pa
.binTopology
== TOP_POINT_LIST
)
580 clipTopology
= TOP_POINT_LIST
;
582 else if (pa
.binTopology
== TOP_RECT_LIST
)
584 clipTopology
= TOP_RECT_LIST
;
587 else if (NumVertsPerPrim
== 2)
589 clipTopology
= TOP_LINE_LIST
;
593 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Scalar views of the SIMD registers so individual lanes can be indexed by inputPrim.
596 const uint32_t* pVertexCount
= reinterpret_cast<const uint32_t*>(&vNumClippedVerts
);
597 const uint32_t* pPrimitiveId
= reinterpret_cast<const uint32_t*>(&vPrimId
);
598 const uint32_t* pViewportIdx
= reinterpret_cast<const uint32_t*>(&vViewportIdx
);
599 const uint32_t* pRtIdx
= reinterpret_cast<const uint32_t*>(&vRtIdx
);
// Byte offsets of vertices[0..6] used as gather indices; the eighth lane is unused.
601 const SIMD256::Integer vOffsets
=
602 SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T
<SIMD_T
>), // unused lane
603 6 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
604 5 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
605 4 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
606 3 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
607 2 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
608 1 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
609 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>));
611 // only need to gather 7 verts
612 // @todo dynamic mask based on actual # of verts generated per lane
613 const SIMD256::Float vMask
= SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
615 uint32_t numClippedPrims
= 0;
617 // transpose clipper output so that each lane's vertices are in SIMD order
618 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// Two alternative storages for transposedPrims follow (heap and stack); the
// preprocessor condition selecting between them is not visible in this excerpt.
622 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack
623 // overflow in debug builds
624 SIMDVERTEX_T
<SIMD_T
>* transposedPrims
= reinterpret_cast<SIMDVERTEX_T
<SIMD_T
>*>(
625 AlignedMalloc(sizeof(SIMDVERTEX_T
<SIMD_T
>) * 2, 64));
628 SIMDVERTEX_T
<SIMD_T
> transposedPrims
[2];
// Process each input primitive (lane) separately.
631 uint32_t numInputPrims
= pa
.NumPrims();
632 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
634 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
// fewer vertices than a full primitive: nothing to emit for this lane
635 if (numEmittedVerts
< NumVertsPerPrim
)
639 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
641 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
642 SWR_ASSERT(numEmittedPrims
<= 7, "Unexpected primitive count from clipper.");
644 numClippedPrims
+= numEmittedPrims
;
646 // transpose clipper output so that each lane's vertices are in SIMD order
647 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// pBase addresses lane `inputPrim` of component 0 of the position in vertices[0];
// the gathers below step from vertex to vertex through vOffsets.
651 uint8_t* pBase
= reinterpret_cast<uint8_t*>(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) +
652 sizeof(float) * inputPrim
;
655 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
656 static const float *dummy
= reinterpret_cast<const float *>(pBase
);
// transpose position: one gather per component, then advance to the next component
659 for (uint32_t c
= 0; c
< 4; ++c
)
661 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(
662 SIMD256::setzero_ps(), reinterpret_cast<const float*>(pBase
), vOffsets
, vMask
);
663 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] =
664 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
665 pBase
+= sizeof(Float
<SIMD_T
>);
// transpose attributes, starting at the first attribute slot
670 reinterpret_cast<uint8_t*>(&vertices
[0].attrib
[backendState
.vertexAttribOffset
]) +
671 sizeof(float) * inputPrim
;
673 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
675 uint32_t attribSlot
= backendState
.vertexAttribOffset
+ attrib
;
677 for (uint32_t c
= 0; c
< 4; ++c
)
679 SIMD256::Float temp
=
680 SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(
681 SIMD256::setzero_ps(),
682 reinterpret_cast<const float*>(pBase
),
685 transposedPrims
[0].attrib
[attribSlot
][c
] =
686 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
687 pBase
+= sizeof(Float
<SIMD_T
>);
691 // transpose user clip distances if enabled
692 uint32_t vertexClipCullSlot
= backendState
.vertexClipCullOffset
;
693 if (state
.backendState
.clipDistanceMask
& 0x0f)
695 pBase
= reinterpret_cast<uint8_t*>(&vertices
[0].attrib
[vertexClipCullSlot
]) +
696 sizeof(float) * inputPrim
;
698 for (uint32_t c
= 0; c
< 4; ++c
)
700 SIMD256::Float temp
=
701 SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(
702 SIMD256::setzero_ps(),
703 reinterpret_cast<const float*>(pBase
),
706 transposedPrims
[0].attrib
[vertexClipCullSlot
][c
] =
707 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
708 pBase
+= sizeof(Float
<SIMD_T
>);
712 if (state
.backendState
.clipDistanceMask
& 0xf0)
714 pBase
= reinterpret_cast<uint8_t*>(&vertices
[0].attrib
[vertexClipCullSlot
+ 1]) +
715 sizeof(float) * inputPrim
;
717 for (uint32_t c
= 0; c
< 4; ++c
)
719 SIMD256::Float temp
=
720 SIMD256::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(
721 SIMD256::setzero_ps(),
722 reinterpret_cast<const float*>(pBase
),
725 transposedPrims
[0].attrib
[vertexClipCullSlot
+ 1][c
] =
726 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
727 pBase
+= sizeof(Float
<SIMD_T
>);
// Primitive assembler over the transposed vertices; it inherits the array-active
// flags from the incoming PA.
731 PA_STATE_OPT
clipPA(pDC
,
733 reinterpret_cast<uint8_t*>(&transposedPrims
[0]),
739 clipPA
.viewportArrayActive
= pa
.viewportArrayActive
;
740 clipPA
.rtArrayActive
= pa
.rtArrayActive
;
// primMaskMap[n] has the low n bits set: one bit per emitted primitive (n <= 7, asserted above)
742 static const uint32_t primMaskMap
[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
744 const uint32_t primMask
= primMaskMap
[numEmittedPrims
];
// Every primitive emitted from this lane shares the input primitive's ids.
746 const Integer
<SIMD_T
> primID
= SIMD_T::set1_epi32(pPrimitiveId
[inputPrim
]);
747 const Integer
<SIMD_T
> viewportIdx
= SIMD_T::set1_epi32(pViewportIdx
[inputPrim
]);
748 const Integer
<SIMD_T
> rtIdx
= SIMD_T::set1_epi32(pRtIdx
[inputPrim
]);
// Drain the clip PA: assemble positions, then pass them on with the ids above
// (the callee line is not visible here; presumably the binner selected earlier).
750 while (clipPA
.GetNextStreamOutput())
754 Vec4
<SIMD_T
> attrib
[NumVertsPerPrim
];
756 bool assemble
= clipPA
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
761 pDC
, clipPA
, workerId
, attrib
, primMask
, primID
, viewportIdx
, rtIdx
);
764 } while (clipPA
.NextPrim());
// releases the AlignedMalloc'ed transposedPrims buffer
769 AlignedFree(transposedPrims
);
772 // update global pipeline stat
773 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
// Clip stage entry point for one SIMD batch of primitives.
//   pa          - primitive assembler for the batch (supplies pDC and binTopology).
//   primId / viewportIdx / rtIdx - per-lane ids forwarded with the primitives.
// Visible flow: update the invocation stat, compute clip codes, remove NaN and
// user-culled primitives from primMask, reject primitives outside the view
// frustum, then either run the clipper or forward valid primitives to the binner.
776 void ExecuteStage(PA_STATE
& pa
,
779 Integer
<SIMD_T
> const& primId
,
780 Integer
<SIMD_T
> const& viewportIdx
,
781 Integer
<SIMD_T
> const& rtIdx
)
783 SWR_ASSERT(pa
.pDC
!= nullptr);
// binner chosen from the incoming topology
785 BinnerChooser
<SIMD_T
> binner(pa
.binTopology
,
786 pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
788 // update clipper invocations pipeline stat
789 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
790 UPDATE_STAT_FE(CInvocations
, numInvoc
);
792 ComputeClipCodes(prim
, viewportIdx
);
794 // cull prims with NAN coords
795 primMask
&= ~ComputeNaNMask(prim
);
797 // user cull distance cull
798 if (state
.backendState
.cullDistanceMask
| state
.backendState
.clipDistanceMask
)
800 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
// A frustum bit set on every vertex means the whole primitive lies outside that plane.
803 Float
<SIMD_T
> clipIntersection
= ComputeClipCodeIntersection();
804 // Mask out non-frustum codes
805 clipIntersection
= SIMD_T::and_ps(clipIntersection
,
806 SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK
)));
// keep only lanes whose masked intersection is zero
808 // cull prims outside view frustum
810 primMask
& SimdHelper
<SIMD_T
>::cmpeq_ps_mask(clipIntersection
, SIMD_T::setzero_ps());
812 // skip clipping for points
813 uint32_t clipMask
= 0;
814 if (NumVertsPerPrim
!= 1)
816 clipMask
= validMask
& ComputeClipMask();
819 AR_EVENT(ClipInfoEvent(numInvoc
, validMask
, clipMask
));
// Clipping path (timed with RDTSC_BEGIN/END); the scalar masks are expanded to vector masks.
823 RDTSC_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
824 // we have to clip tris, execute the clipper, which will also
827 SIMD_T::vmask_ps(validMask
),
828 SIMD_T::vmask_ps(clipMask
),
833 RDTSC_END(FEGuardbandClip
, 1);
// Non-clipping path: count valid primitives and pass them on unchanged.
837 // update CPrimitives pipeline state
838 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
840 // forward valid prims directly to binner
842 this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
, viewportIdx
, rtIdx
);
// Returns boundaryCoord0 / (boundaryCoord0 - boundaryCoord1) per lane: the parameter
// at which a value varying linearly from boundaryCoord0 to boundaryCoord1 reaches zero.
// No guard against a zero denominator is present here.
847 Float
<SIMD_T
> ComputeInterpFactor(Float
<SIMD_T
> const& boundaryCoord0
,
848 Float
<SIMD_T
> const& boundaryCoord1
)
850 return SIMD_T::div_ps(boundaryCoord0
, SIMD_T::sub_ps(boundaryCoord0
, boundaryCoord1
));
// Builds per-lane byte offsets into an array of SIMDVERTEX_T<SIMD_T>:
//   vIndices[lane] * sizeof(SIMDVERTEX_T) + attrib * sizeof(Vec4)
//     + component * sizeof(Float) + elemOffset[lane]
//   attrib    - attribute slot within the vertex.
//   vIndices  - per-lane vertex index.
//   component - component (0..3) within the attribute.
854 ComputeOffsets(uint32_t attrib
, Integer
<SIMD_T
> const& vIndices
, uint32_t component
)
856 const uint32_t simdVertexStride
= sizeof(SIMDVERTEX_T
<SIMD_T
>);
857 const uint32_t componentStride
= sizeof(Float
<SIMD_T
>);
858 const uint32_t attribStride
= sizeof(Vec4
<SIMD_T
>);
// per-lane element offsets (table contents are not visible in this excerpt);
// the static_assert checks the table is large enough for one Integer<SIMD_T> load
860 static const OSALIGNSIMD16(uint32_t) elemOffset
[16] = {
879 static_assert(sizeof(Integer
<SIMD_T
>) <= sizeof(elemOffset
),
880 "Clipper::ComputeOffsets, Increase number of element offsets.");
882 Integer
<SIMD_T
> vElemOffset
=
883 SIMD_T::loadu_si(reinterpret_cast<const Integer
<SIMD_T
>*>(elemOffset
));
885 // step to the simdvertex
886 Integer
<SIMD_T
> vOffsets
=
887 SIMD_T::mullo_epi32(vIndices
, SIMD_T::set1_epi32(simdVertexStride
));
889 // step to the attribute and component
890 vOffsets
= SIMD_T::add_epi32(
891 vOffsets
, SIMD_T::set1_epi32(attribStride
* attrib
+ componentStride
* component
));
// step to the lane's own element
894 vOffsets
= SIMD_T::add_epi32(vOffsets
, vElemOffset
);
// Masked gather of one attribute component for a per-lane vertex index.
//   pBuffer  - base of the vertex array.
//   vMask    - lanes to read.
//   vIndices - per-lane vertex index.
// Lanes not selected by vMask receive the zero source value. Offsets come from
// ComputeOffsets and are in bytes (gather scale 1).
899 Float
<SIMD_T
> GatherComponent(const float* pBuffer
,
901 Float
<SIMD_T
> const& vMask
,
902 Integer
<SIMD_T
> const& vIndices
,
905 Integer
<SIMD_T
> vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
906 Float
<SIMD_T
> vSrc
= SIMD_T::setzero_ps();
908 return SIMD_T::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(
909 vSrc
, pBuffer
, vOffsets
, vMask
);
// Writes one attribute component to a per-lane vertex index, one lane at a time.
//   pBuffer  - base of the destination vertex array.
//   vMask    - lanes to write.
//   vIndices - per-lane destination vertex index.
//   vSrc     - values to store.
// NOTE(review): pBuffer is declared `const float*` but is written through after a
// cast below; consider a non-const parameter if all callers allow it.
912 void ScatterComponent(const float* pBuffer
,
914 Float
<SIMD_T
> const& vMask
,
915 Integer
<SIMD_T
> const& vIndices
,
917 Float
<SIMD_T
> const& vSrc
)
919 Integer
<SIMD_T
> vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
// scalar views of the offset and source registers for per-lane access
921 const uint32_t* pOffsets
= reinterpret_cast<const uint32_t*>(&vOffsets
);
922 const float* pSrc
= reinterpret_cast<const float*>(&vSrc
);
923 uint32_t mask
= SIMD_T::movemask_ps(vMask
);
// visit each set bit (active lane), clearing it once handled
925 while (_BitScanForward(&lane
, mask
))
927 mask
&= ~(1 << lane
);
// byte offset applied to the base pointer, then a scalar float store
928 const uint8_t* pBuf
= reinterpret_cast<const uint8_t*>(pBuffer
) + pOffsets
[lane
];
929 *(float*)pBuf
= pSrc
[lane
];
// Computes the intersection of edge (v1, v2) with ClippingPlane for the active
// lanes, and writes the interpolated position, attributes and (if enabled) user
// clip distances to pOutVerts at outIndex. outIndex itself is not modified here.
// Every quantity is interpolated as a + (b - a) * t.
933 template <SWR_CLIPCODES ClippingPlane
>
934 void intersect(const Float
<SIMD_T
>& vActiveMask
, // active lanes to operate on
935 const Integer
<SIMD_T
>& s
, // index to first edge vertex v0 in pInPts.
936 const Integer
<SIMD_T
>& p
, // index to second edge vertex v1 in pInPts.
937 const Vec4
<SIMD_T
>& v1
, // vertex 0 position
938 const Vec4
<SIMD_T
>& v2
, // vertex 1 position
939 Integer
<SIMD_T
>& outIndex
, // output index.
940 const float* pInVerts
, // array of all the input positions.
941 uint32_t numInAttribs
, // number of attributes per vertex.
942 float* pOutVerts
) // array of output positions. We'll write our new intersection
945 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
946 uint32_t vertexClipCullOffset
= this->state
.backendState
.vertexClipCullOffset
;
// Each case feeds ComputeInterpFactor the signed boundary coordinate of both
// endpoints; t is where that coordinate crosses zero along the edge.
948 // compute interpolation factor
950 switch (ClippingPlane
)
// boundary coordinate w + x (zero on x = -w)
953 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[0]), SIMD_T::add_ps(v2
[3], v2
[0]));
// boundary coordinate w - x (zero on x = w)
956 t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[0]), SIMD_T::sub_ps(v2
[3], v2
[0]));
// boundary coordinate w + y (zero on y = -w)
959 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[1]), SIMD_T::add_ps(v2
[3], v2
[1]));
// boundary coordinate w - y (zero on y = w)
962 t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[1]), SIMD_T::sub_ps(v2
[3], v2
[1]));
965 // DX Znear plane is 0, GL is -w
966 if (this->state
.rastState
.clipHalfZ
)
// boundary coordinate z (zero on z = 0)
968 t
= ComputeInterpFactor(v1
[2], v2
[2]);
// boundary coordinate w + z (zero on z = -w)
972 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[2]), SIMD_T::add_ps(v2
[3], v2
[2]));
// boundary coordinate w - z (zero on z = w)
976 t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[2]), SIMD_T::sub_ps(v2
[3], v2
[2]));
979 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
982 // interpolate position and store
983 for (uint32_t c
= 0; c
< 4; ++c
)
985 Float
<SIMD_T
> vOutPos
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
986 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
989 // interpolate attributes and store
990 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
992 uint32_t attribSlot
= vertexAttribOffset
+ a
;
993 for (uint32_t c
= 0; c
< 4; ++c
)
// endpoint values are fetched from the input vertex array at indices s and p
995 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
996 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
997 Float
<SIMD_T
> vOutAttrib
=
998 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
999 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
1003 // interpolate clip distance if enabled
1004 if (this->state
.backendState
.clipDistanceMask
& 0xf)
1006 uint32_t attribSlot
= vertexClipCullOffset
;
1007 for (uint32_t c
= 0; c
< 4; ++c
)
1009 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
1010 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
1011 Float
<SIMD_T
> vOutAttrib
=
1012 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
1013 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
// second clip/cull slot (high four mask bits)
1017 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
1019 uint32_t attribSlot
= vertexClipCullOffset
+ 1;
1020 for (uint32_t c
= 0; c
< 4; ++c
)
1022 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
1023 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
1024 Float
<SIMD_T
> vOutAttrib
=
1025 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
1026 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
// Returns a per-lane mask that is set where position v lies on the inside of
// ClippingPlane (boundary included). An unrecognised plane reports an error via
// SWR_INVALID and yields an all-zero mask.
1031 template <SWR_CLIPCODES ClippingPlane
>
1032 Float
<SIMD_T
> inside(const Vec4
<SIMD_T
>& v
)
1034 switch (ClippingPlane
)
// x >= -w
1037 return SIMD_T::cmpge_ps(v
[0], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
// x <= w
1039 return SIMD_T::cmple_ps(v
[0], v
[3]);
// y >= -w
1041 return SIMD_T::cmpge_ps(v
[1], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
// y <= w
1042 case FRUSTUM_BOTTOM
:
1043 return SIMD_T::cmple_ps(v
[1], v
[3]);
// z >= 0 when clipHalfZ is set, otherwise z >= -w
1045 return SIMD_T::cmpge_ps(v
[2],
1046 this->state
.rastState
.clipHalfZ
1047 ? SIMD_T::setzero_ps()
1048 : SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
// z <= w
1050 return SIMD_T::cmple_ps(v
[2], v
[3]);
1052 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
1053 return SIMD_T::setzero_ps();
// Clips a per-lane polygon against ClippingPlane.
//   pInVerts     - input vertex array.
//   vNumInPts    - per-lane number of input vertices.
//   numInAttribs - number of attributes to carry along with the position.
// For each edge (s, p), with p wrapping to vertex 0 after the last vertex:
// s is copied to the output when it is inside, and an intersection vertex is
// emitted when s and p are on opposite sides. vOutIndex counts output vertices per lane.
1057 template <SWR_CLIPCODES ClippingPlane
>
1058 Integer
<SIMD_T
> ClipTriToPlane(const float* pInVerts
,
1059 const Integer
<SIMD_T
>& vNumInPts
,
1060 uint32_t numInAttribs
,
1063 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1065 Integer
<SIMD_T
> vCurIndex
= SIMD_T::setzero_si();
1066 Integer
<SIMD_T
> vOutIndex
= SIMD_T::setzero_si();
// a lane is active while it still has unprocessed input vertices
1067 Float
<SIMD_T
> vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1069 while (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
1071 Integer
<SIMD_T
> s
= vCurIndex
;
1072 Integer
<SIMD_T
> p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
// wrap p back to 0 in lanes where s + 1 runs past the last vertex
1073 Integer
<SIMD_T
> underFlowMask
= SIMD_T::cmpgt_epi32(vNumInPts
, p
);
1074 p
= SIMD_T::castps_si(SIMD_T::blendv_ps(
1075 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p
), SIMD_T::castsi_ps(underFlowMask
)));
// fetch the positions of both edge endpoints
1078 Vec4
<SIMD_T
> vInPos0
, vInPos1
;
1079 for (uint32_t c
= 0; c
< 4; ++c
)
1081 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1082 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1085 // compute inside mask
1086 Float
<SIMD_T
> s_in
= inside
<ClippingPlane
>(vInPos0
);
1087 Float
<SIMD_T
> p_in
= inside
<ClippingPlane
>(vInPos1
);
1089 // compute intersection mask (s_in != p_in)
1090 Float
<SIMD_T
> intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1091 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1093 // store s if inside
1094 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1095 if (!SIMD_T::testz_ps(s_in
, s_in
))
// copy the position
1098 for (uint32_t c
= 0; c
< 4; ++c
)
1101 pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
// copy the attributes
1105 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1107 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1108 for (uint32_t c
= 0; c
< 4; ++c
)
1110 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1111 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1115 // store clip distance if enabled
1116 uint32_t vertexClipCullSlot
= this->state
.backendState
.vertexClipCullOffset
;
1117 if (this->state
.backendState
.clipDistanceMask
& 0xf)
1119 uint32_t attribSlot
= vertexClipCullSlot
;
1120 for (uint32_t c
= 0; c
< 4; ++c
)
1122 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1123 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1127 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
1129 uint32_t attribSlot
= vertexClipCullSlot
+ 1;
1130 for (uint32_t c
= 0; c
< 4; ++c
)
1132 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1133 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
// advance the output index only in lanes that stored s
1137 // increment outIndex
1138 vOutIndex
= SIMD_T::blendv_epi32(
1139 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1142 // compute and store intersection
1143 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1145 intersect
<ClippingPlane
>(intersectMask
,
1155 // increment outIndex for active lanes
1156 vOutIndex
= SIMD_T::blendv_epi32(
1157 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1160 // increment loop index and update active mask
1161 vCurIndex
= SIMD_T::add_epi32(vCurIndex
, SIMD_T::set1_epi32(1));
1162 vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
// Clips one line (two vertices, indices 0 and 1) per SIMD lane against the
// plane selected by the ClippingPlane template argument.
//   pInVerts     - input vertex data, read through GatherComponent
//   vNumInPts    - per-lane input vertex count; a lane is processed only when
//                  0 < vNumInPts holds for it
//   numInAttribs - number of attribute slots handled, starting at
//                  state.backendState.vertexAttribOffset
// Kept endpoints and plane intersections are written to pOutVerts through
// ScatterComponent / intersect<>, and vOutIndex is advanced by one in every
// lane that emitted a vertex.
// NOTE(review): the pOutVerts parameter declaration, the braces and the return
// statement are not visible in this listing; presumably vOutIndex (the per-lane
// output vertex count) is returned - confirm against the full file.
1168 template <SWR_CLIPCODES ClippingPlane
>
1169 Integer
<SIMD_T
> ClipLineToPlane(const float* pInVerts
,
1170 const Integer
<SIMD_T
>& vNumInPts
,
1171 uint32_t numInAttribs
,
// first attribute slot, taken from the backend state
1174 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
// vCurIndex and vOutIndex both start at zero; the active mask selects the
// lanes whose input vertex count is greater than zero
1176 Integer
<SIMD_T
> vCurIndex
= SIMD_T::setzero_si();
1177 Integer
<SIMD_T
> vOutIndex
= SIMD_T::setzero_si();
1178 Float
<SIMD_T
> vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
// skip all work when no lane is active
1180 if (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
))
// s and p index the two endpoints of the line: vertex 0 and vertex 1
1182 Integer
<SIMD_T
> s
= vCurIndex
;
1183 Integer
<SIMD_T
> p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
// gather the four position components of both endpoints
1186 Vec4
<SIMD_T
> vInPos0
, vInPos1
;
1187 for (uint32_t c
= 0; c
< 4; ++c
)
1189 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1190 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1193 // compute inside mask
1194 Float
<SIMD_T
> s_in
= inside
<ClippingPlane
>(vInPos0
);
1195 Float
<SIMD_T
> p_in
= inside
<ClippingPlane
>(vInPos1
);
1197 // compute intersection mask (s_in != p_in)
1198 Float
<SIMD_T
> intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1199 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1201 // store s if inside
1202 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1203 if (!SIMD_T::testz_ps(s_in
, s_in
))
// write the position of s to the current output slot
// NOTE(review): the head of this call is not visible in this listing;
// presumably ScatterComponent, as in the attribute loop below - confirm
1205 for (uint32_t c
= 0; c
< 4; ++c
)
1208 pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
// NOTE: despite the wording of the next comment, the loop below copies each
// attribute component of s unchanged (gather from index s, scatter to
// vOutIndex); no interpolation is performed in this loop
1211 // interpolate attributes and store
1212 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1214 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1215 for (uint32_t c
= 0; c
< 4; ++c
)
1217 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1218 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1222 // increment outIndex
1223 vOutIndex
= SIMD_T::blendv_epi32(
1224 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1227 // compute and store intersection
1228 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
// NOTE(review): the remaining intersect<> arguments are not visible in this
// listing - confirm against the full file
1230 intersect
<ClippingPlane
>(intersectMask
,
1240 // increment outIndex for active lanes
1241 vOutIndex
= SIMD_T::blendv_epi32(
1242 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1245 // store p if inside
1246 p_in
= SIMD_T::and_ps(p_in
, vActiveMask
);
1247 if (!SIMD_T::testz_ps(p_in
, p_in
))
// write the position of p to the current output slot
1249 for (uint32_t c
= 0; c
< 4; ++c
)
1252 pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
// NOTE: as for s, the attributes of p are copied unchanged by the loop below
1255 // interpolate attributes and store
1256 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1258 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1259 for (uint32_t c
= 0; c
< 4; ++c
)
1261 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
1262 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
1266 // increment outIndex
1267 vOutIndex
= SIMD_T::blendv_epi32(
1268 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), p_in
);
// Clips a SIMD batch of primitives against the six frustum planes, in the
// order NEAR, FAR, LEFT, RIGHT, BOTTOM, TOP.
//   pVertices - vertex data for the batch; the passes alternate between
//               pVertices and a temp buffer, and the sixth pass writes its
//               output back into pVertices
//   vPrimMask - lane mask, used together with vClipMask to pick the lanes whose
//               vertex count is reset to NumVertsPerPrim at the end
//   vClipMask - lane mask; only lanes set here start with a non-zero input
//               vertex count and are therefore actually clipped
// NumVertsPerPrim == 3 uses ClipTriToPlane; otherwise NumVertsPerPrim == 2 is
// asserted and ClipLineToPlane is used.
// NOTE(review): the numAttribs parameter declaration, the braces/else and the
// return statement are not visible in this listing. Several clip calls and the
// final blendv_epi32 also appear here without an assignment of their result to
// vNumOutPts; this looks like lines missing from the listing rather than the
// real code - confirm against the full file.
1275 Integer
<SIMD_T
> ClipPrims(float* pVertices
,
1276 const Float
<SIMD_T
>& vPrimMask
,
1277 const Float
<SIMD_T
>& vClipMask
,
// temp vertex storage used as the second buffer of the alternating passes below
1281 float* pTempVerts
= reinterpret_cast<float*>(ClipHelper
<SIMD_T
>::GetTempVertices());
1283 // zero out num input verts for non-active lanes
1284 Integer
<SIMD_T
> vNumInPts
= SIMD_T::set1_epi32(NumVertsPerPrim
);
1285 vNumInPts
= SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts
, vClipMask
);
1287 // clip prims to frustum
1288 Integer
<SIMD_T
> vNumOutPts
;
// triangle path: each pass reads the buffer the previous pass wrote
1289 if (NumVertsPerPrim
== 3)
1291 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1292 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1294 ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1296 ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1298 ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1299 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
// line path: same plane order and buffer alternation, using ClipLineToPlane
1303 SWR_ASSERT(NumVertsPerPrim
== 2);
1305 ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1307 ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1309 ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1311 ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1313 ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1315 ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1318 // restore num verts for non-clipped, active lanes
// presumably andnot_ps(a, b) computes (~a & b), i.e. the lanes set in vPrimMask
// but not in vClipMask, which matches the comment above - confirm
1319 Float
<SIMD_T
> vNonClippedMask
= SIMD_T::andnot_ps(vClipMask
, vPrimMask
);
1321 SIMD_T::blendv_epi32(vNumOutPts
, SIMD_T::set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
// Per-clipper data members.
// worker id, default-initialized to 0 (presumably the index of the worker
// thread running this clipper - confirm against the constructor)
1326 const uint32_t workerId
{0};
// draw context pointer, default-initialized to nullptr
1327 DRAW_CONTEXT
* pDC
{nullptr};
// reference to the API state; the clip routines read state.backendState from it
1328 const API_STATE
& state
;
// one SIMD float value per primitive vertex (presumably the per-vertex clip
// codes for the current batch - confirm)
1329 Float
<SIMD_T
> clipCodes
[NumVertsPerPrim
];
// Declarations of the clip entry points, one per primitive type
// (rectangles, triangles, lines, points). Each takes a draw context plus
// simdscalari primId / viewportIdx / rtIdx values (presumably per-lane
// primitive ID, viewport index and render-target index - confirm).
// NOTE(review): part of each parameter list is not visible in this listing.
1332 // pipeline stage functions
1333 void ClipRectangles(DRAW_CONTEXT
* pDC
,
1338 simdscalari
const& primId
,
1339 simdscalari
const& viewportIdx
,
1340 simdscalari
const& rtIdx
);
1341 void ClipTriangles(DRAW_CONTEXT
* pDC
,
1346 simdscalari
const& primId
,
1347 simdscalari
const& viewportIdx
,
1348 simdscalari
const& rtIdx
);
1349 void ClipLines(DRAW_CONTEXT
* pDC
,
1354 simdscalari
const& primId
,
1355 simdscalari
const& viewportIdx
,
1356 simdscalari
const& rtIdx
);
1357 void ClipPoints(DRAW_CONTEXT
* pDC
,
1362 simdscalari
const& primId
,
1363 simdscalari
const& viewportIdx
,
1364 simdscalari
const& rtIdx
);
1365 #if USE_SIMD16_FRONTEND
// SIMD16 variants of the clip entry points, declared only when
// USE_SIMD16_FRONTEND is enabled. They mirror the declarations above but take
// simd16vector / simd16scalari arguments and use the SIMDCALL calling convention.
// NOTE(review): part of each parameter list is not visible in this listing.
1366 void SIMDCALL
ClipRectangles_simd16(DRAW_CONTEXT
* pDC
,
1369 simd16vector prims
[],
1371 simd16scalari
const& primId
,
1372 simd16scalari
const& viewportIdx
,
1373 simd16scalari
const& rtIdx
);
1374 void SIMDCALL
ClipTriangles_simd16(DRAW_CONTEXT
* pDC
,
1377 simd16vector prims
[],
1379 simd16scalari
const& primId
,
1380 simd16scalari
const& viewportIdx
,
1381 simd16scalari
const& rtIdx
);
1382 void SIMDCALL
ClipLines_simd16(DRAW_CONTEXT
* pDC
,
1385 simd16vector prims
[],
1387 simd16scalari
const& primId
,
1388 simd16scalari
const& viewportIdx
,
1389 simd16scalari
const& rtIdx
);
1390 void SIMDCALL
ClipPoints_simd16(DRAW_CONTEXT
* pDC
,
1393 simd16vector prims
[],
1395 simd16scalari
const& primId
,
1396 simd16scalari
const& viewportIdx
,
1397 simd16scalari
const& rtIdx
);