1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for clipping
27 ******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
37 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
38 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
39 // rather than intersection, of clipcodes.
40 #define CLIPCODE_SHIFT 23
// Frustum plane codes: one distinct bit per plane (0x01..0x20), each shifted up by CLIPCODE_SHIFT.
41 FRUSTUM_LEFT
= (0x01 << CLIPCODE_SHIFT
),
42 FRUSTUM_TOP
= (0x02 << CLIPCODE_SHIFT
),
43 FRUSTUM_RIGHT
= (0x04 << CLIPCODE_SHIFT
),
44 FRUSTUM_BOTTOM
= (0x08 << CLIPCODE_SHIFT
),
46 FRUSTUM_NEAR
= (0x10 << CLIPCODE_SHIFT
),
47 FRUSTUM_FAR
= (0x20 << CLIPCODE_SHIFT
),
// Code recorded by ComputeClipCodes() for lanes whose w <= 0.
49 NEGW
= (0x40 << CLIPCODE_SHIFT
),
// Guardband codes: all four share the single high bit (0x80 << CLIPCODE_SHIFT) and are told apart
// only by one of the four low bits 0x1/0x2/0x4/0x8.
51 GUARDBAND_LEFT
= (0x80 << CLIPCODE_SHIFT
| 0x1),
52 GUARDBAND_TOP
= (0x80 << CLIPCODE_SHIFT
| 0x2),
53 GUARDBAND_RIGHT
= (0x80 << CLIPCODE_SHIFT
| 0x4),
54 GUARDBAND_BOTTOM
= (0x80 << CLIPCODE_SHIFT
| 0x8)
// Codes that require actual clipping: near/far planes, the four guardband edges and negative w.
57 #define GUARDBAND_CLIP_MASK \
58 (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
59 GUARDBAND_BOTTOM | NEGW)
// The six view-frustum plane codes only (no guardband bits, no NEGW).
60 #define FRUSTUM_CLIP_MASK \
61 (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
63 template <typename SIMD_T
>
64 void ComputeClipCodes(const API_STATE
& state
,
65 const Vec4
<SIMD_T
>& vertex
,
66 Float
<SIMD_T
>& clipCodes
,
67 Integer
<SIMD_T
> const& viewportIndexes
)
69 clipCodes
= SIMD_T::setzero_ps();
72 Float
<SIMD_T
> vNegW
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::set1_ps(-1.0f
));
75 Float
<SIMD_T
> vRes
= SIMD_T::cmplt_ps(vertex
.x
, vNegW
);
76 clipCodes
= SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT
)));
79 vRes
= SIMD_T::cmplt_ps(vertex
.y
, vNegW
);
80 clipCodes
= SIMD_T::or_ps(
81 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP
))));
84 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, vertex
.w
);
85 clipCodes
= SIMD_T::or_ps(
86 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT
))));
89 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, vertex
.w
);
90 clipCodes
= SIMD_T::or_ps(
91 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM
))));
93 if (state
.rastState
.depthClipEnable
)
96 // DX clips depth [0..w], GL clips [-w..w]
97 if (state
.rastState
.clipHalfZ
)
99 vRes
= SIMD_T::cmplt_ps(vertex
.z
, SIMD_T::setzero_ps());
103 vRes
= SIMD_T::cmplt_ps(vertex
.z
, vNegW
);
105 clipCodes
= SIMD_T::or_ps(
106 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR
))));
109 vRes
= SIMD_T::cmpgt_ps(vertex
.z
, vertex
.w
);
110 clipCodes
= SIMD_T::or_ps(
111 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR
))));
115 vRes
= SIMD_T::cmple_ps(vertex
.w
, SIMD_T::setzero_ps());
117 SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW
))));
120 Float
<SIMD_T
> gbMult
= SIMD_T::mul_ps(vNegW
,
121 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
122 &state
.gbState
.left
[0], viewportIndexes
));
123 vRes
= SIMD_T::cmplt_ps(vertex
.x
, gbMult
);
124 clipCodes
= SIMD_T::or_ps(
125 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT
))));
128 gbMult
= SIMD_T::mul_ps(vNegW
,
129 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
130 &state
.gbState
.top
[0], viewportIndexes
));
131 vRes
= SIMD_T::cmplt_ps(vertex
.y
, gbMult
);
132 clipCodes
= SIMD_T::or_ps(
133 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP
))));
136 gbMult
= SIMD_T::mul_ps(vertex
.w
,
137 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
138 &state
.gbState
.right
[0], viewportIndexes
));
139 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, gbMult
);
140 clipCodes
= SIMD_T::or_ps(
141 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT
))));
144 gbMult
= SIMD_T::mul_ps(vertex
.w
,
145 SIMD_T::template i32gather_ps
<ScaleFactor
<SIMD_T
>(4)>(
146 &state
.gbState
.bottom
[0], viewportIndexes
));
147 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, gbMult
);
148 clipCodes
= SIMD_T::or_ps(
149 clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM
))));
// BinnerChooser<SIMD256>: holds a binning function pointer (pfnBinFunc) chosen at construction
// time and forwards primitives to it through BinFunc().
// NOTE(review): this listing is missing lines between the numbered statements (braces, the
// switch header and some case labels, some BinFunc parameters); comments below describe only
// what is visible.
152 template <typename SIMD_T
>
158 struct BinnerChooser
<SIMD256
>
160 PFN_PROCESS_PRIMS pfnBinFunc
;
// Selects the binner from the vertex count of the primitive: 3 -> triangle binner (conservative
// variant when conservativeRast > 0), 2 -> BinLines; any other count trips the assert below.
162 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
166 if (numVertsPerPrim
== 3)
168 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
171 else if (numVertsPerPrim
== 2)
173 pfnBinFunc
= BinLines
;
177 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Selects the binner from the primitive topology: BinPoints, BinLines (the visible labels include
// the two line-adjacency topologies) or the triangle binner.
181 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
188 pfnBinFunc
= BinPoints
;
193 case TOP_LINE_LIST_ADJ
:
194 case TOP_LISTSTRIP_ADJ
:
195 pfnBinFunc
= BinLines
;
198 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
// Forwards all arguments unchanged to the selected binning function; asserts one was selected.
203 void BinFunc(DRAW_CONTEXT
* pDC
,
206 SIMD256::Vec4 prims
[],
208 SIMD256::Integer
const& primID
,
209 SIMD256::Integer
& viewportIdx
,
210 SIMD256::Integer
& rtIdx
)
212 SWR_ASSERT(pfnBinFunc
!= nullptr);
214 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
// BinnerChooser<SIMD512>: same selection logic as the SIMD256 specialization, but using the
// *_simd16 binning functions and SIMD512 argument types. Only compiled when
// USE_SIMD16_FRONTEND is set.
// NOTE(review): this listing is missing lines between the numbered statements (braces, the
// switch header and some case labels, some BinFunc parameters).
218 #if USE_SIMD16_FRONTEND
220 struct BinnerChooser
<SIMD512
>
222 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc
;
// Selects by vertex count: 3 -> triangle binner, 2 -> BinLines_simd16, otherwise asserts.
224 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
228 if (numVertsPerPrim
== 3)
230 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
233 else if (numVertsPerPrim
== 2)
235 pfnBinFunc
= BinLines_simd16
;
239 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Selects by topology: BinPoints_simd16, BinLines_simd16 or the triangle binner.
243 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
250 pfnBinFunc
= BinPoints_simd16
;
255 case TOP_LINE_LIST_ADJ
:
256 case TOP_LISTSTRIP_ADJ
:
257 pfnBinFunc
= BinLines_simd16
;
260 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
// Forwards all arguments unchanged to the selected binning function; asserts one was selected.
265 void BinFunc(DRAW_CONTEXT
* pDC
,
268 SIMD512::Vec4 prims
[],
270 SIMD512::Integer
const& primID
,
271 SIMD512::Integer
& viewportIdx
,
272 SIMD512::Integer
& rtIdx
)
274 SWR_ASSERT(pfnBinFunc
!= nullptr);
276 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
, viewportIdx
, rtIdx
);
// SimdHelper<SIMD256>: width-specific helpers used by the clipper.
// NOTE(review): the body of the primary template is not visible in this listing.
281 template <typename SIMD_T
>
287 struct SimdHelper
<SIMD256
>
// A 256-bit value already has the target width, so it is returned unchanged.
289 static SIMD256::Float
insert_lo_ps(SIMD256::Float a
) { return a
; }
// Per-lane equality compare, reduced to a bit mask with movemask_ps.
291 static SIMD256::Mask
cmpeq_ps_mask(SIMD256::Float a
, SIMD256::Float b
)
293 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a
, b
));
// SimdHelper<SIMD512>: 512-bit counterparts of the SIMD256 helpers; only compiled when
// USE_SIMD16_FRONTEND is set.
297 #if USE_SIMD16_FRONTEND
299 struct SimdHelper
<SIMD512
>
// Places the 256-bit value `a` into position 0 of an otherwise zeroed 512-bit value.
301 static SIMD512::Float
insert_lo_ps(SIMD256::Float a
)
303 return SIMD512::insert_ps
<0>(SIMD512::setzero_ps(), a
);
// Per-lane EQ_OQ compare that yields the mask directly.
306 static SIMD512::Mask
cmpeq_ps_mask(SIMD512::Float a
, SIMD512::Float b
)
308 return SIMD512::cmp_ps_mask
<SIMD16::CompareType::EQ_OQ
>(a
, b
);
313 template <typename SIMD_T
, uint32_t NumVertsPerPrimT
>
317 INLINE
Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
318 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
320 static_assert(NumVertsPerPrimT
>= 1 && NumVertsPerPrimT
<= 3, "Invalid NumVertsPerPrim");
321 THREAD_DATA
&thread_data
= in_pDC
->pContext
->threadPool
.pThreadData
[workerId
];
323 if (thread_data
.clipperData
== nullptr)
325 // 7 vertex temp data
326 // 7 post-clipped vertices
327 // 2 transposed verts for binning
328 size_t alloc_size
= sizeof(SIMDVERTEX_T
<SIMD_T
>) * (7 + 7 + 2);
329 thread_data
.clipperData
= AlignedMalloc(alloc_size
, KNOB_SIMD16_BYTES
);
331 SWR_ASSERT(thread_data
.clipperData
);
333 this->clippedVerts
= (SIMDVERTEX_T
<SIMD_T
>*)thread_data
.clipperData
;
334 this->tmpVerts
= this->clippedVerts
+ 7;
335 this->transposedVerts
= this->tmpVerts
+ 7;
338 void ComputeClipCodes(Vec4
<SIMD_T
> vertex
[], const Integer
<SIMD_T
>& viewportIndexes
)
340 for (uint32_t i
= 0; i
< NumVertsPerPrimT
; ++i
)
342 ::ComputeClipCodes
<SIMD_T
>(state
, vertex
[i
], clipCodes
[i
], viewportIndexes
);
346 Float
<SIMD_T
> ComputeClipCodeIntersection()
348 Float
<SIMD_T
> result
= clipCodes
[0];
350 for (uint32_t i
= 1; i
< NumVertsPerPrimT
; ++i
)
352 result
= SIMD_T::and_ps(result
, clipCodes
[i
]);
358 Float
<SIMD_T
> ComputeClipCodeUnion()
360 Float
<SIMD_T
> result
= clipCodes
[0];
362 for (uint32_t i
= 1; i
< NumVertsPerPrimT
; ++i
)
364 result
= SIMD_T::or_ps(result
, clipCodes
[i
]);
370 int ComputeClipMask()
372 Float
<SIMD_T
> clipUnion
= ComputeClipCodeUnion();
375 SIMD_T::and_ps(clipUnion
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK
)));
377 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion
, SIMD_T::setzero_ps()));
380 // clipper is responsible for culling any prims with NAN coordinates
381 int ComputeNaNMask(Vec4
<SIMD_T
> prim
[])
383 Float
<SIMD_T
> vNanMask
= SIMD_T::setzero_ps();
385 for (uint32_t e
= 0; e
< NumVertsPerPrimT
; ++e
)
387 Float
<SIMD_T
> vNan01
=
388 SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[0], prim
[e
].v
[1]);
389 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan01
);
391 Float
<SIMD_T
> vNan23
=
392 SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[2], prim
[e
].v
[3]);
393 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan23
);
396 return SIMD_T::movemask_ps(vNanMask
);
// Builds the mask of lanes to discard because of user cull/clip distances.
// The distances are assembled from attribute slots vertexClipCullOffset (Lo) and
// vertexClipCullOffset + 1 (Hi). Each set bit of cullDistanceMask / clipDistanceMask names one
// distance: slot = bit >> 2, component = bit & 3.
// Returns the movemask of the accumulated per-lane discard mask.
// NOTE(review): this listing is missing lines (the declaration of `index`, braces, and whatever
// selects between the Lo and Hi assignments - presumably keyed on `slot`; confirm against the
// full file).
399 int ComputeUserClipCullMask(PA_STATE
& pa
, Vec4
<SIMD_T
> prim
[])
401 uint8_t cullMask
= state
.backendState
.cullDistanceMask
;
402 uint32_t vertexClipCullOffset
= state
.backendState
.vertexClipCullOffset
;
404 Float
<SIMD_T
> vClipCullMask
= SIMD_T::setzero_ps();
406 Vec4
<SIMD_T
> vClipCullDistLo
[3];
407 Vec4
<SIMD_T
> vClipCullDistHi
[3];
409 pa
.Assemble(vertexClipCullOffset
, vClipCullDistLo
);
410 pa
.Assemble(vertexClipCullOffset
+ 1, vClipCullDistHi
);
// Cull distances: walk the set bits of cullMask, clearing each bit once it is taken.
413 while (_BitScanForward(&index
, cullMask
))
415 cullMask
&= ~(1 << index
);
416 uint32_t slot
= index
>> 2;
417 uint32_t component
= index
& 0x3;
// vCullMaskElem is ANDed with every vertex's test result, so it stays set only for lanes in
// which all vertices fail this cull distance.
419 Float
<SIMD_T
> vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
420 for (uint32_t e
= 0; e
< NumVertsPerPrimT
; ++e
)
422 Float
<SIMD_T
> vCullComp
;
425 vCullComp
= vClipCullDistLo
[e
][component
];
429 vCullComp
= vClipCullDistHi
[e
][component
];
432 // cull if cull distance < 0 || NAN
433 Float
<SIMD_T
> vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(
434 SIMD_T::setzero_ps(), vCullComp
);
435 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
437 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
440 // clipper should also discard any primitive with NAN clip distance
441 uint8_t clipMask
= state
.backendState
.clipDistanceMask
;
// Clip distances: same bit walk, with an additional NaN test per vertex.
442 while (_BitScanForward(&index
, clipMask
))
444 clipMask
&= ~(1 << index
);
445 uint32_t slot
= index
>> 2;
446 uint32_t component
= index
& 0x3;
448 Float
<SIMD_T
> vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
449 for (uint32_t e
= 0; e
< NumVertsPerPrimT
; ++e
)
451 Float
<SIMD_T
> vClipComp
;
454 vClipComp
= vClipCullDistLo
[e
][component
];
458 vClipComp
= vClipCullDistHi
[e
][component
];
// vClip: comparing the value with itself as unordered is true exactly when it is NaN.
// It is ORed into the result immediately, so one NaN vertex is enough to discard the lane.
461 Float
<SIMD_T
> vClip
=
462 SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(vClipComp
, vClipComp
);
463 Float
<SIMD_T
> vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(
464 SIMD_T::setzero_ps(), vClipComp
);
465 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
466 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vClip
);
468 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
471 return SIMD_T::movemask_ps(vClipCullMask
);
// Clips one SIMD batch of primitives and sends the resulting primitives to the binner.
// Visible steps:
//   1. copy positions, attributes and (if enabled) user clip distances of the input primitives
//      into the clipper vertex store (this->clippedVerts);
//   2. call ClipPrims(), which yields the per-lane count of clipped vertices;
//   3. for each input primitive that still has enough vertices, gather its vertices out of the
//      store into this->transposedVerts, wrap them in a PA_STATE_OPT and bin them;
//   4. add the number of emitted primitives to the CPrimitives pipeline statistic.
// NOTE(review): this listing is missing lines between the numbered statements (one parameter,
// braces, several declarations such as `mapSlot` and `pBase`, the PA_STATE_OPT constructor
// arguments and the head of the binning loop); comments describe only what is visible.
474 void ClipSimd(const Vec4
<SIMD_T
> prim
[],
475 const Float
<SIMD_T
>& vPrimMask
,
476 const Float
<SIMD_T
>& vClipMask
,
478 const Integer
<SIMD_T
>& vPrimId
,
479 const Integer
<SIMD_T
>& vViewportIdx
,
480 const Integer
<SIMD_T
>& vRtIdx
)
482 // input/output vertex store for clipper
483 SIMDVERTEX_T
<SIMD_T
>* vertices
= this->clippedVerts
;
485 uint32_t constantInterpMask
= state
.backendState
.constantInterpolationMask
;
// Vertex 0 is the provoking vertex unless the topology is a triangle fan, in which case the
// index comes from the frontend state.
486 uint32_t provokingVertex
= 0;
487 if (pa
.binTopology
== TOP_TRIANGLE_FAN
)
489 provokingVertex
= state
.frontendState
.provokingVertex
.triFan
;
491 ///@todo: line topology for wireframe?
494 Vec4
<SIMD_T
> tmpVector
[NumVertsPerPrimT
];
// Positions go into VERTEX_POSITION_SLOT of each vertex.
495 for (uint32_t i
= 0; i
< NumVertsPerPrimT
; ++i
)
497 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = prim
[i
];
501 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
// maxSlot tracks the highest mapped attribute slot seen; numAttribs below is maxSlot + 1.
503 int32_t maxSlot
= -1;
504 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
506 // Compute absolute attrib slot in vertex array
508 backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
509 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
510 uint32_t inputSlot
= backendState
.vertexAttribOffset
+ mapSlot
;
512 pa
.Assemble(inputSlot
, tmpVector
);
514 // if constant interpolation enabled for this attribute, assign the provoking
515 // vertex values to all edges
516 if (CheckBit(constantInterpMask
, slot
))
518 for (uint32_t i
= 0; i
< NumVertsPerPrimT
; ++i
)
520 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
525 for (uint32_t i
= 0; i
< NumVertsPerPrimT
; ++i
)
527 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
532 // assemble user clip distances if enabled
533 uint32_t vertexClipCullSlot
= state
.backendState
.vertexClipCullOffset
;
// Low nibble of clipDistanceMask -> distances stored in slot vertexClipCullSlot.
534 if (state
.backendState
.clipDistanceMask
& 0xf)
536 pa
.Assemble(vertexClipCullSlot
, tmpVector
);
537 for (uint32_t i
= 0; i
< NumVertsPerPrimT
; ++i
)
539 vertices
[i
].attrib
[vertexClipCullSlot
] = tmpVector
[i
];
// High nibble of clipDistanceMask -> distances stored in slot vertexClipCullSlot + 1.
543 if (state
.backendState
.clipDistanceMask
& 0xf0)
545 pa
.Assemble(vertexClipCullSlot
+ 1, tmpVector
);
546 for (uint32_t i
= 0; i
< NumVertsPerPrimT
; ++i
)
548 vertices
[i
].attrib
[vertexClipCullSlot
+ 1] = tmpVector
[i
];
552 uint32_t numAttribs
= maxSlot
+ 1;
// Run the clipper over the vertex store; the result holds one clipped-vertex count per lane.
554 Integer
<SIMD_T
> vNumClippedVerts
=
555 ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
557 BinnerChooser
<SIMD_T
> binner(NumVertsPerPrimT
,
558 pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
560 // set up new PA for binning clipped primitives
561 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
562 if (NumVertsPerPrimT
== 3)
564 clipTopology
= TOP_TRIANGLE_FAN
;
566 // so that the binner knows to bloat wide points later
567 if (pa
.binTopology
== TOP_POINT_LIST
)
569 clipTopology
= TOP_POINT_LIST
;
571 else if (pa
.binTopology
== TOP_RECT_LIST
)
573 clipTopology
= TOP_RECT_LIST
;
576 else if (NumVertsPerPrimT
== 2)
578 clipTopology
= TOP_LINE_LIST
;
582 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Scalar views of the SIMD inputs so they can be indexed by input primitive (lane).
585 const uint32_t* pVertexCount
= reinterpret_cast<const uint32_t*>(&vNumClippedVerts
);
586 const uint32_t* pPrimitiveId
= reinterpret_cast<const uint32_t*>(&vPrimId
);
587 const uint32_t* pViewportIdx
= reinterpret_cast<const uint32_t*>(&vViewportIdx
);
588 const uint32_t* pRtIdx
= reinterpret_cast<const uint32_t*>(&vRtIdx
);
// Gather offsets: lane k reads from vertex k of the store (stride = one SIMD vertex); the
// last lane is unused and is disabled through vMask below.
590 const SIMD256::Integer vOffsets
=
591 SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T
<SIMD_T
>), // unused lane
592 6 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
593 5 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
594 4 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
595 3 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
596 2 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
597 1 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
598 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>));
600 // only need to gather 7 verts
601 // @todo dynamic mask based on actual # of verts generated per lane
602 const SIMD256::Float vMask
= SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
604 uint32_t numClippedPrims
= 0;
606 // transpose clipper output so that each lane's vertices are in SIMD order
607 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
609 SIMDVERTEX_T
<SIMD_T
>* transposedPrims
= this->transposedVerts
;
611 uint32_t numInputPrims
= pa
.NumPrims();
612 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
614 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
// NOTE(review): the body of this test is not visible; presumably primitives with too few
// remaining vertices are skipped - confirm.
615 if (numEmittedVerts
< NumVertsPerPrimT
)
619 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
621 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
622 SWR_ASSERT(numEmittedPrims
<= 7, "Unexpected primitive count from clipper.");
624 numClippedPrims
+= numEmittedPrims
;
626 // transpose clipper output so that each lane's vertices are in SIMD order
627 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
632 reinterpret_cast<float const*>(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) +
// Position: one masked gather per component; pBase then advances by one SIMD float to reach
// the next component.
635 for (uint32_t c
= 0; c
< 4; ++c
)
637 SIMD256::Float temp
=
638 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase
, vOffsets
, vMask
);
639 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] =
640 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
641 pBase
= PtrAdd(pBase
, sizeof(Float
<SIMD_T
>));
645 pBase
= reinterpret_cast<float const*>(
646 &vertices
[0].attrib
[backendState
.vertexAttribOffset
]) +
// Attributes: same gather pattern, starting at vertexAttribOffset, for numAttribs slots.
649 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
651 uint32_t attribSlot
= backendState
.vertexAttribOffset
+ attrib
;
653 for (uint32_t c
= 0; c
< 4; ++c
)
655 SIMD256::Float temp
=
656 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase
, vOffsets
, vMask
);
657 transposedPrims
[0].attrib
[attribSlot
][c
] =
658 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
659 pBase
= PtrAdd(pBase
, sizeof(Float
<SIMD_T
>));
663 // transpose user clip distances if enabled
664 uint32_t vertexClipCullSlot
= backendState
.vertexClipCullOffset
;
665 if (state
.backendState
.clipDistanceMask
& 0x0f)
667 pBase
= reinterpret_cast<float const*>(&vertices
[0].attrib
[vertexClipCullSlot
]) +
670 for (uint32_t c
= 0; c
< 4; ++c
)
672 SIMD256::Float temp
=
673 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase
, vOffsets
, vMask
);
674 transposedPrims
[0].attrib
[vertexClipCullSlot
][c
] =
675 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
676 pBase
= PtrAdd(pBase
, sizeof(Float
<SIMD_T
>));
680 if (state
.backendState
.clipDistanceMask
& 0xf0)
683 reinterpret_cast<float const*>(&vertices
[0].attrib
[vertexClipCullSlot
+ 1]) +
686 for (uint32_t c
= 0; c
< 4; ++c
)
688 SIMD256::Float temp
=
689 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase
, vOffsets
, vMask
);
690 transposedPrims
[0].attrib
[vertexClipCullSlot
+ 1][c
] =
691 SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
692 pBase
= PtrAdd(pBase
, sizeof(Float
<SIMD_T
>));
// Wrap the transposed vertices in a primitive assembler for binning.
696 PA_STATE_OPT
clipPA(pDC
,
698 reinterpret_cast<uint8_t*>(&transposedPrims
[0]),
704 clipPA
.viewportArrayActive
= pa
.viewportArrayActive
;
705 clipPA
.rtArrayActive
= pa
.rtArrayActive
;
// primMaskMap[n] has the low n bits set: one bit per primitive emitted for this input prim.
// The asserts above bound the index to 7.
707 static const uint32_t primMaskMap
[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
709 const uint32_t primMask
= primMaskMap
[numEmittedPrims
];
// Every primitive produced from this input primitive inherits its id, viewport and RT index.
711 const Integer
<SIMD_T
> primID
= SIMD_T::set1_epi32(pPrimitiveId
[inputPrim
]);
712 const Integer
<SIMD_T
> viewportIdx
= SIMD_T::set1_epi32(pViewportIdx
[inputPrim
]);
713 const Integer
<SIMD_T
> rtIdx
= SIMD_T::set1_epi32(pRtIdx
[inputPrim
]);
715 while (clipPA
.GetNextStreamOutput())
719 Vec4
<SIMD_T
> attrib
[NumVertsPerPrimT
];
721 bool assemble
= clipPA
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
726 pDC
, clipPA
, workerId
, attrib
, primMask
, primID
, viewportIdx
, rtIdx
);
729 } while (clipPA
.NextPrim());
733 // update global pipeline stat
734 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
// Entry point of the clip stage for one SIMD batch of primitives.
// Visible steps: count invocations, compute clip codes, drop lanes with NaN positions, drop
// lanes rejected by user cull/clip distances, drop lanes entirely outside one frustum plane,
// then compute the mask of lanes that need clipping (skipped for points). The tail shows both
// a guardband-clip path (timed with RDTSC_BEGIN/RDTSC_END) and a path that forwards valid
// primitives straight to the binner.
// NOTE(review): this listing is missing lines (some parameters, the declaration of
// `validMask`, braces and the branch that chooses between the two tail paths).
737 void ExecuteStage(PA_STATE
& pa
,
740 Integer
<SIMD_T
> const& primId
,
741 Integer
<SIMD_T
> const& viewportIdx
,
742 Integer
<SIMD_T
> const& rtIdx
)
744 SWR_ASSERT(pa
.pDC
!= nullptr);
746 BinnerChooser
<SIMD_T
> binner(pa
.binTopology
,
747 pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
749 // update clipper invocations pipeline stat
750 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
751 UPDATE_STAT_FE(CInvocations
, numInvoc
);
753 ComputeClipCodes(prim
, viewportIdx
);
755 // cull prims with NAN coords
756 primMask
&= ~ComputeNaNMask(prim
);
758 // user cull distance cull
759 if (state
.backendState
.cullDistanceMask
| state
.backendState
.clipDistanceMask
)
761 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
// A non-zero intersection means every vertex of the primitive is outside the same plane.
764 Float
<SIMD_T
> clipIntersection
= ComputeClipCodeIntersection();
765 // Mask out non-frustum codes
766 clipIntersection
= SIMD_T::and_ps(clipIntersection
,
767 SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK
)));
769 // cull prims outside view frustum
// Lanes survive only where the masked intersection compares equal to zero.
771 primMask
& SimdHelper
<SIMD_T
>::cmpeq_ps_mask(clipIntersection
, SIMD_T::setzero_ps());
773 // skip clipping for points
774 uint32_t clipMask
= 0;
775 if (NumVertsPerPrimT
!= 1)
777 clipMask
= validMask
& ComputeClipMask();
780 AR_EVENT(ClipInfoEvent(numInvoc
, validMask
, clipMask
));
784 RDTSC_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
785 // we have to clip tris, execute the clipper, which will also
788 SIMD_T::vmask_ps(validMask
),
789 SIMD_T::vmask_ps(clipMask
),
794 RDTSC_END(FEGuardbandClip
, 1);
798 // update CPrimitives pipeline state
799 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
801 // forward valid prims directly to binner
803 this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
, viewportIdx
, rtIdx
);
808 Float
<SIMD_T
> ComputeInterpFactor(Float
<SIMD_T
> const& boundaryCoord0
,
809 Float
<SIMD_T
> const& boundaryCoord1
)
811 return SIMD_T::div_ps(boundaryCoord0
, SIMD_T::sub_ps(boundaryCoord0
, boundaryCoord1
));
// Computes, per lane, the byte offset of one float inside the clipper vertex store:
//   vIndices[lane] * sizeof(SIMD vertex) + attrib * sizeof(Vec4) + component * sizeof(Float)
//   + elemOffset[lane]
// NOTE(review): this listing is missing the return type, the contents of the elemOffset table
// and the return statement; presumably vOffsets is returned - confirm.
815 ComputeOffsets(uint32_t attrib
, Integer
<SIMD_T
> const& vIndices
, uint32_t component
)
817 const uint32_t simdVertexStride
= sizeof(SIMDVERTEX_T
<SIMD_T
>);
818 const uint32_t componentStride
= sizeof(Float
<SIMD_T
>);
819 const uint32_t attribStride
= sizeof(Vec4
<SIMD_T
>);
821 static const OSALIGNSIMD16(uint32_t) elemOffset
[16] = {
// The table must provide at least one entry per lane of Integer<SIMD_T>.
840 static_assert(sizeof(Integer
<SIMD_T
>) <= sizeof(elemOffset
),
841 "Clipper::ComputeOffsets, Increase number of element offsets.");
843 Integer
<SIMD_T
> vElemOffset
=
844 SIMD_T::loadu_si(reinterpret_cast<const Integer
<SIMD_T
>*>(elemOffset
));
846 // step to the simdvertex
847 Integer
<SIMD_T
> vOffsets
=
848 SIMD_T::mullo_epi32(vIndices
, SIMD_T::set1_epi32(simdVertexStride
));
850 // step to the attribute and component
851 vOffsets
= SIMD_T::add_epi32(
852 vOffsets
, SIMD_T::set1_epi32(attribStride
* attrib
+ componentStride
* component
));
// finally add the per-lane element offset loaded from the table
855 vOffsets
= SIMD_T::add_epi32(vOffsets
, vElemOffset
);
// Reads one component of one attribute for every active lane.
// Offsets come from ComputeOffsets(attrib, vIndices, component); lanes not selected by vMask
// receive 0.0f because the gather source is zeroed.
// NOTE(review): the parameter lines for `attrib` and `component` are missing from this listing.
860 Float
<SIMD_T
> GatherComponent(const float* pBuffer
,
862 Float
<SIMD_T
> const& vMask
,
863 Integer
<SIMD_T
> const& vIndices
,
866 Integer
<SIMD_T
> vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
867 Float
<SIMD_T
> vSrc
= SIMD_T::setzero_ps();
869 return SIMD_T::mask_i32gather_ps(vSrc
, pBuffer
, vOffsets
, vMask
);
// Writes one component of one attribute for every lane selected by vMask.
// The write is done lane by lane: the lane's byte offset (from ComputeOffsets) is added to
// pBuffer and the lane's value from vSrc is stored there.
// NOTE(review): pBuffer is declared `const float*` but is written through a cast below.
// NOTE(review): the parameter lines for `attrib` and `component` and the declaration of `lane`
// are missing from this listing.
872 void ScatterComponent(const float* pBuffer
,
874 Float
<SIMD_T
> const& vMask
,
875 Integer
<SIMD_T
> const& vIndices
,
877 Float
<SIMD_T
> const& vSrc
)
879 Integer
<SIMD_T
> vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
// Scalar views of the SIMD values so they can be indexed per lane.
881 const uint32_t* pOffsets
= reinterpret_cast<const uint32_t*>(&vOffsets
);
882 const float* pSrc
= reinterpret_cast<const float*>(&vSrc
);
883 uint32_t mask
= SIMD_T::movemask_ps(vMask
);
// Visit each set bit of the lane mask, clearing it once handled.
885 while (_BitScanForward(&lane
, mask
))
887 mask
&= ~(1 << lane
);
888 const uint8_t* pBuf
= reinterpret_cast<const uint8_t*>(pBuffer
) + pOffsets
[lane
];
889 *(float*)pBuf
= pSrc
[lane
];
// Computes the intersection of edge (v1, v2) with ClippingPlane for the active lanes and writes
// the new vertex to pOutVerts at outIndex.
// The factor t comes from ComputeInterpFactor(); position, the numInAttribs attributes starting
// at vertexAttribOffset, and (if enabled in clipDistanceMask) the clip distance slots are each
// interpolated as a + (b - a) * t and scattered to the output.
// NOTE(review): this listing is missing the declaration of `t`, the case labels, the breaks and
// braces; the plane noted before each expression is inferred from the tests in
// ComputeClipCodes - confirm against the full file.
893 template <SWR_CLIPCODES ClippingPlane
>
894 void intersect(const Float
<SIMD_T
>& vActiveMask
, // active lanes to operate on
895 const Integer
<SIMD_T
>& s
, // index to first edge vertex v0 in pInPts.
896 const Integer
<SIMD_T
>& p
, // index to second edge vertex v1 in pInPts.
897 const Vec4
<SIMD_T
>& v1
, // vertex 0 position
898 const Vec4
<SIMD_T
>& v2
, // vertex 1 position
899 Integer
<SIMD_T
>& outIndex
, // output index.
900 const float* pInVerts
, // array of all the input positions.
901 uint32_t numInAttribs
, // number of attributes per vertex.
902 float* pOutVerts
) // array of output positions. We'll write our new intersection
905 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
906 uint32_t vertexClipCullOffset
= this->state
.backendState
.vertexClipCullOffset
;
908 // compute interpolation factor
910 switch (ClippingPlane
)
// boundary coordinate w + x (presumably the left plane)
913 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[0]), SIMD_T::add_ps(v2
[3], v2
[0]));
// boundary coordinate w - x (presumably the right plane)
916 t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[0]), SIMD_T::sub_ps(v2
[3], v2
[0]));
// boundary coordinate w + y (presumably the top plane)
919 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[1]), SIMD_T::add_ps(v2
[3], v2
[1]));
// boundary coordinate w - y (presumably the bottom plane)
922 t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[1]), SIMD_T::sub_ps(v2
[3], v2
[1]));
925 // DX Znear plane is 0, GL is -w
926 if (this->state
.rastState
.clipHalfZ
)
928 t
= ComputeInterpFactor(v1
[2], v2
[2]);
932 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[2]), SIMD_T::add_ps(v2
[3], v2
[2]));
// boundary coordinate w - z (presumably the far plane)
936 t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[2]), SIMD_T::sub_ps(v2
[3], v2
[2]));
939 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
942 // interpolate position and store
943 for (uint32_t c
= 0; c
< 4; ++c
)
945 Float
<SIMD_T
> vOutPos
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
946 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
949 // interpolate attributes and store
950 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
952 uint32_t attribSlot
= vertexAttribOffset
+ a
;
953 for (uint32_t c
= 0; c
< 4; ++c
)
955 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
956 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
957 Float
<SIMD_T
> vOutAttrib
=
958 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
959 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
963 // interpolate clip distance if enabled
964 if (this->state
.backendState
.clipDistanceMask
& 0xf)
966 uint32_t attribSlot
= vertexClipCullOffset
;
967 for (uint32_t c
= 0; c
< 4; ++c
)
969 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
970 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
971 Float
<SIMD_T
> vOutAttrib
=
972 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
973 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
// second clip distance slot (high nibble of clipDistanceMask)
977 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
979 uint32_t attribSlot
= vertexClipCullOffset
+ 1;
980 for (uint32_t c
= 0; c
< 4; ++c
)
982 Float
<SIMD_T
> vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
983 Float
<SIMD_T
> vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
984 Float
<SIMD_T
> vOutAttrib
=
985 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
986 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
// Returns the per-lane result of testing position v against ClippingPlane; a lane is set when
// the vertex is on the inside of (or exactly on) the plane.
// NOTE(review): only the FRUSTUM_BOTTOM label is visible in this listing; the other case labels
// are missing. The plane noted before each test is inferred from the tests in
// ComputeClipCodes - confirm against the full file.
991 template <SWR_CLIPCODES ClippingPlane
>
992 Float
<SIMD_T
> inside(const Vec4
<SIMD_T
>& v
)
994 switch (ClippingPlane
)
// x >= -w (presumably the left plane)
997 return SIMD_T::cmpge_ps(v
[0], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
// x <= w (presumably the right plane)
999 return SIMD_T::cmple_ps(v
[0], v
[3]);
// y >= -w (presumably the top plane)
1001 return SIMD_T::cmpge_ps(v
[1], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
1002 case FRUSTUM_BOTTOM
:
1003 return SIMD_T::cmple_ps(v
[1], v
[3]);
// z >= 0 when clipHalfZ is set, otherwise z >= -w (presumably the near plane)
1005 return SIMD_T::cmpge_ps(v
[2],
1006 this->state
.rastState
.clipHalfZ
1007 ? SIMD_T::setzero_ps()
1008 : SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
// z <= w (presumably the far plane)
1010 return SIMD_T::cmple_ps(v
[2], v
[3]);
// any other plane value is reported as invalid and treated as "outside" for all lanes
1012 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
1013 return SIMD_T::setzero_ps();
// Clips a polygon (per lane) against one plane.
// For each input index s, with p = s + 1 wrapped to 0 at the end of the polygon: the vertex at s
// is copied to the output if it is inside the plane, and an intersection vertex is emitted when
// s and p lie on different sides. vOutIndex counts the vertices written per lane.
// NOTE(review): this listing is missing lines (the output buffer parameter, the heads of some
// ScatterComponent calls, most arguments of the intersect call, braces and the return
// statement; presumably vOutIndex is returned - confirm).
1017 template <SWR_CLIPCODES ClippingPlane
>
1018 Integer
<SIMD_T
> ClipTriToPlane(const float* pInVerts
,
1019 const Integer
<SIMD_T
>& vNumInPts
,
1020 uint32_t numInAttribs
,
1023 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1025 Integer
<SIMD_T
> vCurIndex
= SIMD_T::setzero_si();
1026 Integer
<SIMD_T
> vOutIndex
= SIMD_T::setzero_si();
// A lane is active while its current index is below its input vertex count.
1027 Float
<SIMD_T
> vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1029 while (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
1031 Integer
<SIMD_T
> s
= vCurIndex
;
1032 Integer
<SIMD_T
> p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
// Wrap p back to vertex 0 for lanes where s is the last vertex of the polygon.
1033 Integer
<SIMD_T
> underFlowMask
= SIMD_T::cmpgt_epi32(vNumInPts
, p
);
1034 p
= SIMD_T::castps_si(SIMD_T::blendv_ps(
1035 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p
), SIMD_T::castsi_ps(underFlowMask
)));
// Fetch the positions of both edge end points.
1038 Vec4
<SIMD_T
> vInPos0
, vInPos1
;
1039 for (uint32_t c
= 0; c
< 4; ++c
)
1041 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1042 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1045 // compute inside mask
1046 Float
<SIMD_T
> s_in
= inside
<ClippingPlane
>(vInPos0
);
1047 Float
<SIMD_T
> p_in
= inside
<ClippingPlane
>(vInPos1
);
1049 // compute intersection mask (s_in != p_in)
1050 Float
<SIMD_T
> intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1051 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1053 // store s if inside
1054 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1055 if (!SIMD_T::testz_ps(s_in
, s_in
))
1058 for (uint32_t c
= 0; c
< 4; ++c
)
1061 pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
// copy the attributes of s unchanged
1065 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1067 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1068 for (uint32_t c
= 0; c
< 4; ++c
)
1070 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1071 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1075 // store clip distance if enabled
1076 uint32_t vertexClipCullSlot
= this->state
.backendState
.vertexClipCullOffset
;
1077 if (this->state
.backendState
.clipDistanceMask
& 0xf)
1079 uint32_t attribSlot
= vertexClipCullSlot
;
1080 for (uint32_t c
= 0; c
< 4; ++c
)
1082 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1083 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1087 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
1089 uint32_t attribSlot
= vertexClipCullSlot
+ 1;
1090 for (uint32_t c
= 0; c
< 4; ++c
)
1092 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1093 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1097 // increment outIndex
1098 vOutIndex
= SIMD_T::blendv_epi32(
1099 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1102 // compute and store intersection
1103 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1105 intersect
<ClippingPlane
>(intersectMask
,
1115 // increment outIndex for active lanes
1116 vOutIndex
= SIMD_T::blendv_epi32(
1117 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1120 // increment loop index and update active mask
1121 vCurIndex
= SIMD_T::add_epi32(vCurIndex
, SIMD_T::set1_epi32(1));
1122 vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1128 template <SWR_CLIPCODES ClippingPlane
>
1129 Integer
<SIMD_T
> ClipLineToPlane(const float* pInVerts
,
1130 const Integer
<SIMD_T
>& vNumInPts
,
1131 uint32_t numInAttribs
,
1134 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1136 Integer
<SIMD_T
> vCurIndex
= SIMD_T::setzero_si();
1137 Integer
<SIMD_T
> vOutIndex
= SIMD_T::setzero_si();
1138 Float
<SIMD_T
> vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1140 if (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
))
1142 Integer
<SIMD_T
> s
= vCurIndex
;
1143 Integer
<SIMD_T
> p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
1146 Vec4
<SIMD_T
> vInPos0
, vInPos1
;
1147 for (uint32_t c
= 0; c
< 4; ++c
)
1149 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1150 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1153 // compute inside mask
1154 Float
<SIMD_T
> s_in
= inside
<ClippingPlane
>(vInPos0
);
1155 Float
<SIMD_T
> p_in
= inside
<ClippingPlane
>(vInPos1
);
1157 // compute intersection mask (s_in != p_in)
1158 Float
<SIMD_T
> intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1159 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1161 // store s if inside
1162 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1163 if (!SIMD_T::testz_ps(s_in
, s_in
))
1165 for (uint32_t c
= 0; c
< 4; ++c
)
1168 pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
1171 // interpolate attributes and store
1172 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1174 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1175 for (uint32_t c
= 0; c
< 4; ++c
)
1177 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1178 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1182 // increment outIndex
1183 vOutIndex
= SIMD_T::blendv_epi32(
1184 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1187 // compute and store intersection
1188 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1190 intersect
<ClippingPlane
>(intersectMask
,
1200 // increment outIndex for active lanes
1201 vOutIndex
= SIMD_T::blendv_epi32(
1202 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1205 // store p if inside
1206 p_in
= SIMD_T::and_ps(p_in
, vActiveMask
);
1207 if (!SIMD_T::testz_ps(p_in
, p_in
))
1209 for (uint32_t c
= 0; c
< 4; ++c
)
1212 pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
1215 // interpolate attributes and store
1216 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1218 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1219 for (uint32_t c
= 0; c
< 4; ++c
)
1221 Float
<SIMD_T
> vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
1222 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
1226 // increment outIndex
1227 vOutIndex
= SIMD_T::blendv_epi32(
1228 vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), p_in
);
1235 Integer
<SIMD_T
> ClipPrims(float* pVertices
,
1236 const Float
<SIMD_T
>& vPrimMask
,
1237 const Float
<SIMD_T
>& vClipMask
,
1241 float* pTempVerts
= reinterpret_cast<float*>(this->tmpVerts
);
1243 // zero out num input verts for non-active lanes
1244 Integer
<SIMD_T
> vNumInPts
= SIMD_T::set1_epi32(NumVertsPerPrimT
);
1245 vNumInPts
= SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts
, vClipMask
);
1247 // clip prims to frustum
1248 Integer
<SIMD_T
> vNumOutPts
;
1249 if (NumVertsPerPrimT
== 3)
1251 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1252 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1254 ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1256 ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1258 ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1259 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1263 SWR_ASSERT(NumVertsPerPrimT
== 2);
1265 ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1267 ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1269 ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1271 ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1273 ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1275 ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1278 // restore num verts for non-clipped, active lanes
1279 Float
<SIMD_T
> vNonClippedMask
= SIMD_T::andnot_ps(vClipMask
, vPrimMask
);
1281 SIMD_T::blendv_epi32(vNumOutPts
, SIMD_T::set1_epi32(NumVertsPerPrimT
), vNonClippedMask
);
1286 const uint32_t workerId
{0};
1287 DRAW_CONTEXT
* pDC
{nullptr};
1288 const API_STATE
& state
;
1289 Float
<SIMD_T
> clipCodes
[NumVertsPerPrimT
];
1290 SIMDVERTEX_T
<SIMD_T
>* clippedVerts
;
1291 SIMDVERTEX_T
<SIMD_T
>* tmpVerts
;
1292 SIMDVERTEX_T
<SIMD_T
>* transposedVerts
;
1295 // pipeline stage functions
// Declaration of the ClipRectangles pipeline-stage entry point (returns void).
// Visible parameters: pDC (draw context pointer), then primId, viewportIdx and
// rtIdx, each taken as a const reference to a simdscalari.
// NOTE(review): the parameter lines between pDC and primId are missing from this
// rendering of the file — confirm the full signature against the definition.
1296 void ClipRectangles(DRAW_CONTEXT
* pDC
,
1301 simdscalari
const& primId
,
1302 simdscalari
const& viewportIdx
,
1303 simdscalari
const& rtIdx
);
// Declaration of the ClipTriangles pipeline-stage entry point (returns void).
// Visible parameters: pDC (draw context pointer), then primId, viewportIdx and
// rtIdx, each taken as a const reference to a simdscalari.
// NOTE(review): the parameter lines between pDC and primId are missing from this
// rendering of the file — confirm the full signature against the definition.
1304 void ClipTriangles(DRAW_CONTEXT
* pDC
,
1309 simdscalari
const& primId
,
1310 simdscalari
const& viewportIdx
,
1311 simdscalari
const& rtIdx
);
// Declaration of the ClipLines pipeline-stage entry point (returns void).
// Visible parameters: pDC (draw context pointer), then primId, viewportIdx and
// rtIdx, each taken as a const reference to a simdscalari.
// NOTE(review): the parameter lines between pDC and primId are missing from this
// rendering of the file — confirm the full signature against the definition.
1312 void ClipLines(DRAW_CONTEXT
* pDC
,
1317 simdscalari
const& primId
,
1318 simdscalari
const& viewportIdx
,
1319 simdscalari
const& rtIdx
);
// Declaration of the ClipPoints pipeline-stage entry point (returns void).
// Visible parameters: pDC (draw context pointer), then primId, viewportIdx and
// rtIdx, each taken as a const reference to a simdscalari.
// NOTE(review): the parameter lines between pDC and primId are missing from this
// rendering of the file — confirm the full signature against the definition.
1320 void ClipPoints(DRAW_CONTEXT
* pDC
,
1325 simdscalari
const& primId
,
1326 simdscalari
const& viewportIdx
,
1327 simdscalari
const& rtIdx
);
1328 #if USE_SIMD16_FRONTEND
// Declaration of the SIMD16 variant of the rectangle clip stage (returns void, uses
// the SIMDCALL calling convention); only declared when USE_SIMD16_FRONTEND is set.
// Visible parameters: pDC (draw context pointer), prims (array of simd16vector), then
// primId, viewportIdx and rtIdx, each taken as a const reference to a simd16scalari.
// NOTE(review): the parameter lines between pDC and prims, and between prims and
// primId, are missing from this rendering — confirm against the definition.
1329 void SIMDCALL
ClipRectangles_simd16(DRAW_CONTEXT
* pDC
,
1332 simd16vector prims
[],
1334 simd16scalari
const& primId
,
1335 simd16scalari
const& viewportIdx
,
1336 simd16scalari
const& rtIdx
);
// Declaration of the SIMD16 variant of the triangle clip stage (returns void, uses
// the SIMDCALL calling convention); only declared when USE_SIMD16_FRONTEND is set.
// Visible parameters: pDC (draw context pointer), prims (array of simd16vector), then
// primId, viewportIdx and rtIdx, each taken as a const reference to a simd16scalari.
// NOTE(review): the parameter lines between pDC and prims, and between prims and
// primId, are missing from this rendering — confirm against the definition.
1337 void SIMDCALL
ClipTriangles_simd16(DRAW_CONTEXT
* pDC
,
1340 simd16vector prims
[],
1342 simd16scalari
const& primId
,
1343 simd16scalari
const& viewportIdx
,
1344 simd16scalari
const& rtIdx
);
// Declaration of the SIMD16 variant of the line clip stage (returns void, uses
// the SIMDCALL calling convention); only declared when USE_SIMD16_FRONTEND is set.
// Visible parameters: pDC (draw context pointer), prims (array of simd16vector), then
// primId, viewportIdx and rtIdx, each taken as a const reference to a simd16scalari.
// NOTE(review): the parameter lines between pDC and prims, and between prims and
// primId, are missing from this rendering — confirm against the definition.
1345 void SIMDCALL
ClipLines_simd16(DRAW_CONTEXT
* pDC
,
1348 simd16vector prims
[],
1350 simd16scalari
const& primId
,
1351 simd16scalari
const& viewportIdx
,
1352 simd16scalari
const& rtIdx
);
// Declaration of the SIMD16 variant of the point clip stage (returns void, uses
// the SIMDCALL calling convention); only declared when USE_SIMD16_FRONTEND is set.
// Visible parameters: pDC (draw context pointer), prims (array of simd16vector), then
// primId, viewportIdx and rtIdx, each taken as a const reference to a simd16scalari.
// NOTE(review): the parameter lines between pDC and prims, and between prims and
// primId, are missing from this rendering — confirm against the definition.
1353 void SIMDCALL
ClipPoints_simd16(DRAW_CONTEXT
* pDC
,
1356 simd16vector prims
[],
1358 simd16scalari
const& primId
,
1359 simd16scalari
const& viewportIdx
,
1360 simd16scalari
const& rtIdx
);