1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for clipping
27 ******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
35 // Temp storage used by the clipper.
// Both arrays hold seven SIMD vertices and are only declared here (extern).
// THREAD is a project macro -- presumably a thread-local storage qualifier, TODO confirm.
36 extern THREAD SIMDVERTEX_T
<SIMD256
> tlsTempVertices
[7];
// SIMD512 counterpart, declared only when USE_SIMD16_FRONTEND is enabled.
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T
<SIMD512
> tlsTempVertices_simd16
[7];
43 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
// (The codes are reinterpreted as floats with castsi_ps and compared against zero by the clipper.)
44 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
45 #define CLIPCODE_SHIFT 23
// View-frustum planes: one dedicated bit per plane.
46 FRUSTUM_LEFT
= (0x01 << CLIPCODE_SHIFT
),
47 FRUSTUM_TOP
= (0x02 << CLIPCODE_SHIFT
),
48 FRUSTUM_RIGHT
= (0x04 << CLIPCODE_SHIFT
),
49 FRUSTUM_BOTTOM
= (0x08 << CLIPCODE_SHIFT
),
51 FRUSTUM_NEAR
= (0x10 << CLIPCODE_SHIFT
),
52 FRUSTUM_FAR
= (0x20 << CLIPCODE_SHIFT
),
// Set by ComputeClipCodes for lanes where w <= 0.
54 NEGW
= (0x40 << CLIPCODE_SHIFT
),
// Guardband planes: one shared high bit plus a distinct low bit per side.
56 GUARDBAND_LEFT
= (0x80 << CLIPCODE_SHIFT
| 0x1),
57 GUARDBAND_TOP
= (0x80 << CLIPCODE_SHIFT
| 0x2),
58 GUARDBAND_RIGHT
= (0x80 << CLIPCODE_SHIFT
| 0x4),
59 GUARDBAND_BOTTOM
= (0x80 << CLIPCODE_SHIFT
| 0x8)
// Codes tested by Clipper::ComputeClipMask to decide which lanes must be clipped.
62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
64 template<typename SIMD_T
>
65 void ComputeClipCodes(const API_STATE
&state
, const typename
SIMD_T::Vec4
&vertex
, typename
SIMD_T::Float
&clipCodes
, typename
SIMD_T::Integer
const &viewportIndexes
)
67 clipCodes
= SIMD_T::setzero_ps();
70 typename
SIMD_T::Float vNegW
= SIMD_T::mul_ps(vertex
.w
,SIMD_T::set1_ps(-1.0f
));
73 typename
SIMD_T::Float vRes
= SIMD_T::cmplt_ps(vertex
.x
, vNegW
);
74 clipCodes
= SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT
)));
77 vRes
= SIMD_T::cmplt_ps(vertex
.y
, vNegW
);
78 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP
))));
81 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, vertex
.w
);
82 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT
))));
85 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, vertex
.w
);
86 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM
))));
88 if (state
.rastState
.depthClipEnable
)
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state
.rastState
.clipHalfZ
)
94 vRes
= SIMD_T::cmplt_ps(vertex
.z
, SIMD_T::setzero_ps());
98 vRes
= SIMD_T::cmplt_ps(vertex
.z
, vNegW
);
100 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR
))));
103 vRes
= SIMD_T::cmpgt_ps(vertex
.z
, vertex
.w
);
104 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR
))));
108 vRes
= SIMD_T::cmple_ps(vertex
.w
, SIMD_T::setzero_ps());
109 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW
))));
112 typename
SIMD_T::Float gbMult
= SIMD_T::mul_ps(vNegW
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.left
[0], viewportIndexes
));
113 vRes
= SIMD_T::cmplt_ps(vertex
.x
, gbMult
);
114 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT
))));
117 gbMult
= SIMD_T::mul_ps(vNegW
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.top
[0], viewportIndexes
));
118 vRes
= SIMD_T::cmplt_ps(vertex
.y
, gbMult
);
119 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP
))));
122 gbMult
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.right
[0], viewportIndexes
));
123 vRes
= SIMD_T::cmpgt_ps(vertex
.x
, gbMult
);
124 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT
))));
127 gbMult
= SIMD_T::mul_ps(vertex
.w
, SIMD_T::template i32gather_ps
<typename
SIMD_T::ScaleFactor(4)>(&state
.gbState
.bottom
[0], viewportIndexes
));
128 vRes
= SIMD_T::cmpgt_ps(vertex
.y
, gbMult
);
129 clipCodes
= SIMD_T::or_ps(clipCodes
, SIMD_T::and_ps(vRes
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM
))));
// BinnerChooser picks the binning function for a primitive type. The SIMD256
// specialization below stores a PFN_PROCESS_PRIMS pointer chosen by its constructors.
132 template<typename SIMD_T
>
138 struct BinnerChooser
<SIMD256
>
// Binning entry point selected at construction time.
140 PFN_PROCESS_PRIMS pfnBinFunc
;
// Select by vertex count: 3 -> triangle binner from GetBinTrianglesFunc
// (passed conservativeRast > 0), 2 -> BinLines. Other counts reach the assert.
142 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
145 if (numVertsPerPrim
== 3)
147 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
150 else if (numVertsPerPrim
== 2)
152 pfnBinFunc
= BinLines
;
156 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Select by topology: BinPoints, BinLines, or the triangle binner.
// NOTE(review): the switch header and some case labels are not visible in this
// listing -- confirm the full topology-to-binner mapping against the original header.
160 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
166 pfnBinFunc
= BinPoints
;
171 case TOP_LINE_LIST_ADJ
:
172 case TOP_LISTSTRIP_ADJ
:
173 pfnBinFunc
= BinLines
;
176 pfnBinFunc
= GetBinTrianglesFunc(conservativeRast
> 0);
// Forwards all arguments unchanged to the selected binner; asserts one was selected.
181 void BinFunc(DRAW_CONTEXT
*pDC
, PA_STATE
&pa
, uint32_t workerId
, SIMD256::Vec4 prims
[], uint32_t primMask
, SIMD256::Integer
const &primID
)
183 SWR_ASSERT(pfnBinFunc
!= nullptr);
185 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
);
// SIMD512 specialization of BinnerChooser, compiled only with USE_SIMD16_FRONTEND.
// It has the same shape as the SIMD256 one but selects the *_simd16 binners.
189 #if USE_SIMD16_FRONTEND
191 struct BinnerChooser
<SIMD512
>
// Binning entry point selected at construction time.
193 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc
;
// Select by vertex count: 3 -> GetBinTrianglesFunc_simd16(conservativeRast > 0),
// 2 -> BinLines_simd16. Other counts reach the assert.
195 BinnerChooser(uint32_t numVertsPerPrim
, uint32_t conservativeRast
)
198 if (numVertsPerPrim
== 3)
200 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
203 else if (numVertsPerPrim
== 2)
205 pfnBinFunc
= BinLines_simd16
;
209 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Select by topology: BinPoints_simd16, BinLines_simd16, or the triangle binner.
// NOTE(review): the switch header and some case labels are not visible in this
// listing -- confirm the full topology-to-binner mapping against the original header.
213 BinnerChooser(PRIMITIVE_TOPOLOGY topology
, uint32_t conservativeRast
)
219 pfnBinFunc
= BinPoints_simd16
;
224 case TOP_LINE_LIST_ADJ
:
225 case TOP_LISTSTRIP_ADJ
:
226 pfnBinFunc
= BinLines_simd16
;
229 pfnBinFunc
= GetBinTrianglesFunc_simd16(conservativeRast
> 0);
// Forwards all arguments unchanged to the selected binner; asserts one was selected.
234 void BinFunc(DRAW_CONTEXT
*pDC
, PA_STATE
&pa
, uint32_t workerId
, SIMD512::Vec4 prims
[], uint32_t primMask
, SIMD512::Integer
const &primID
)
236 SWR_ASSERT(pfnBinFunc
!= nullptr);
238 pfnBinFunc(pDC
, pa
, workerId
, prims
, primMask
, primID
);
// SimdHelper hides the operations that differ between SIMD widths.
// This is the SIMD256 specialization.
243 template<typename SIMD_T
>
249 struct SimdHelper
<SIMD256
>
// Takes and returns a SIMD256 float.
// NOTE(review): the body is not visible in this listing; presumably it returns 'a' unchanged -- confirm.
251 static SIMD256::Float
insert_lo_ps(SIMD256::Float a
)
// Returns the movemask of the lane-wise equality compare of a and b.
256 static SIMD256::Mask
cmpeq_ps_mask(SIMD256::Float a
, SIMD256::Float b
)
258 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a
, b
));
// SIMD512 specialization of SimdHelper, compiled only with USE_SIMD16_FRONTEND.
262 #if USE_SIMD16_FRONTEND
264 struct SimdHelper
<SIMD512
>
// Widens a SIMD256 float: 'a' is inserted at position 0 of a zeroed SIMD512 float.
266 static SIMD512::Float
insert_lo_ps(SIMD256::Float a
)
268 return SIMD512::insert_ps
<0>(SIMD512::setzero_ps(), a
);
// Returns the mask of lanes where a == b, using the EQ_OQ compare predicate.
271 static SIMD512::Mask
cmpeq_ps_mask(SIMD512::Float a
, SIMD512::Float b
)
273 return SIMD512::cmp_ps_mask
<SIMD16::CompareType::EQ_OQ
>(a
, b
);
278 // Temp storage used by the clipper.
// ClipHelper maps a SIMD width to its scratch vertex array; this is the SIMD256 specialization.
279 template<typename SIMD_T
>
285 struct ClipHelper
<SIMD256
>
// Returns the extern tlsTempVertices array (decays to a pointer to its first element).
287 static SIMDVERTEX_T
<SIMD256
> *GetTempVertices()
289 return tlsTempVertices
;
// SIMD512 specialization of ClipHelper, compiled only with USE_SIMD16_FRONTEND.
293 #if USE_SIMD16_FRONTEND
295 struct ClipHelper
<SIMD512
>
// Returns the extern tlsTempVertices_simd16 array (decays to a pointer to its first element).
297 static SIMDVERTEX_T
<SIMD512
> *GetTempVertices()
299 return tlsTempVertices_simd16
;
304 template<typename SIMD_T
, uint32_t NumVertsPerPrim
>
// Constructs a clipper for one worker and draw context.
// Stores the worker id and draw context, and initializes 'state' from GetApiState(in_pDC).
308 INLINE
Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
309 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
// Only points (1), lines (2) and triangles (3) are supported; checked at compile time.
311 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
314 void ComputeClipCodes(typename
SIMD_T::Vec4 vertex
[], const typename
SIMD_T::Integer
&viewportIndexes
)
316 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
318 ::ComputeClipCodes
<SIMD_T
>(state
, vertex
[i
], clipCodes
[i
], viewportIndexes
);
322 typename
SIMD_T::Float
ComputeClipCodeIntersection()
324 typename
SIMD_T::Float result
= clipCodes
[0];
326 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
328 result
= SIMD_T::and_ps(result
, clipCodes
[i
]);
334 typename
SIMD_T::Float
ComputeClipCodeUnion()
336 typename
SIMD_T::Float result
= clipCodes
[0];
338 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
340 result
= SIMD_T::or_ps(result
, clipCodes
[i
]);
346 int ComputeClipMask()
348 typename
SIMD_T::Float clipUnion
= ComputeClipCodeUnion();
350 clipUnion
= SIMD_T::and_ps(clipUnion
, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK
)));
352 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion
, SIMD_T::setzero_ps()));
355 // clipper is responsible for culling any prims with NAN coordinates
356 int ComputeNaNMask(typename
SIMD_T::Vec4 prim
[])
358 typename
SIMD_T::Float vNanMask
= SIMD_T::setzero_ps();
360 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
362 typename
SIMD_T::Float vNan01
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[0], prim
[e
].v
[1]);
363 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan01
);
365 typename
SIMD_T::Float vNan23
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(prim
[e
].v
[2], prim
[e
].v
[3]);
366 vNanMask
= SIMD_T::or_ps(vNanMask
, vNan23
);
369 return SIMD_T::movemask_ps(vNanMask
);
// Computes a lane mask of primitives to discard because of user cull/clip distances.
// A lane is set when, for some enabled cull distance, every vertex of the primitive
// has that distance < 0 or NaN, or when any enabled clip distance of any vertex is NaN.
// 'prim' is not read in the visible code; the distances are assembled from 'pa'.
372 int ComputeUserClipCullMask(PA_STATE
&pa
, typename
SIMD_T::Vec4 prim
[])
374 uint8_t cullMask
= state
.backendState
.cullDistanceMask
;
375 uint32_t vertexClipCullOffset
= state
.backendState
.vertexClipCullOffset
;
377 typename
SIMD_T::Float vClipCullMask
= SIMD_T::setzero_ps();
// Lo/Hi hold the two consecutive attribute slots starting at vertexClipCullOffset.
379 typename
SIMD_T::Vec4 vClipCullDistLo
[3];
380 typename
SIMD_T::Vec4 vClipCullDistHi
[3];
382 pa
.Assemble(vertexClipCullOffset
, vClipCullDistLo
);
383 pa
.Assemble(vertexClipCullOffset
+ 1, vClipCullDistHi
);
// Visit each set bit of cullMask; bit index = slot * 4 + component.
386 while (_BitScanForward(&index
, cullMask
))
388 cullMask
&= ~(1 << index
);
389 uint32_t slot
= index
>> 2;
390 uint32_t component
= index
& 0x3;
// Start from -1.0f in every lane and AND in each vertex's result, so a lane
// stays set only if all vertices fail this cull distance.
392 typename
SIMD_T::Float vCullMaskElem
= SIMD_T::set1_ps(-1.0f
);
393 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
395 typename
SIMD_T::Float vCullComp
;
// NOTE(review): the condition choosing between Lo and Hi is not visible in this
// listing; presumably it tests 'slot' -- confirm.
398 vCullComp
= vClipCullDistLo
[e
][component
];
402 vCullComp
= vClipCullDistHi
[e
][component
];
405 // cull if cull distance < 0 || NAN
406 typename
SIMD_T::Float vCull
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::NLE_UQ
>(SIMD_T::setzero_ps(), vCullComp
);
407 vCullMaskElem
= SIMD_T::and_ps(vCullMaskElem
, vCull
);
409 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vCullMaskElem
);
412 // clipper should also discard any primitive with NAN clip distance
413 uint8_t clipMask
= state
.backendState
.clipDistanceMask
;
414 while (_BitScanForward(&index
, clipMask
))
416 clipMask
&= ~(1 << index
);
417 uint32_t slot
= index
>> 2;
418 uint32_t component
= index
& 0x3;
420 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
422 typename
SIMD_T::Float vClipComp
;
425 vClipComp
= vClipCullDistLo
[e
][component
];
429 vClipComp
= vClipCullDistHi
[e
][component
];
// Comparing a value with itself is unordered only when it is NaN.
432 typename
SIMD_T::Float vClip
= SIMD_T::template cmp_ps
<SIMD_T::CompareType::UNORD_Q
>(vClipComp
, vClipComp
);
433 vClipCullMask
= SIMD_T::or_ps(vClipCullMask
, vClip
);
437 return SIMD_T::movemask_ps(vClipCullMask
);
// Clips the primitives of one SIMD batch and bins the results.
// Steps visible below:
//   1. assemble position, attributes and user clip distances into 'vertices'
//   2. run ClipPrims, which returns the per-lane clipped vertex count
//   3. for each input primitive, gather its clipped vertices into transposedPrims[0],
//      wrap them in a PA_STATE_OPT and hand them to the binner
//   4. add the number of emitted primitives to the CPrimitives stat
// vPrimMask/vClipMask are passed through to ClipPrims; vPrimId supplies the per-lane primitive id.
440 void ClipSimd(const typename
SIMD_T::Float
&vPrimMask
, const typename
SIMD_T::Float
&vClipMask
, PA_STATE
&pa
, const typename
SIMD_T::Integer
&vPrimId
)
442 // input/output vertex store for clipper
443 SIMDVERTEX_T
<SIMD_T
> vertices
[7]; // maximum 7 verts generated per triangle
445 uint32_t constantInterpMask
= state
.backendState
.constantInterpolationMask
;
446 uint32_t provokingVertex
= 0;
447 if (pa
.binTopology
== TOP_TRIANGLE_FAN
)
449 provokingVertex
= state
.frontendState
.provokingVertex
.triFan
;
451 ///@todo: line topology for wireframe?
// Positions go into VERTEX_POSITION_SLOT of the first NumVertsPerPrim vertices.
454 typename
SIMD_T::Vec4 tmpVector
[NumVertsPerPrim
];
455 pa
.Assemble(VERTEX_POSITION_SLOT
, tmpVector
);
456 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
458 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = tmpVector
[i
];
462 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
// Copy attributes; maxSlot tracks the highest source slot seen so that
// numAttribs (computed below) is maxSlot + 1.
464 int32_t maxSlot
= -1;
465 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
467 // Compute absolute attrib slot in vertex array
468 uint32_t mapSlot
= backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
469 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
470 uint32_t inputSlot
= backendState
.vertexAttribOffset
+ mapSlot
;
472 pa
.Assemble(inputSlot
, tmpVector
);
474 // if constant interpolation enabled for this attribute, assign the provoking
475 // vertex values to all edges
476 if (CheckBit(constantInterpMask
, slot
))
478 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
480 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
485 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
487 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
492 // assemble user clip distances if enabled
// The low nibble of clipDistanceMask uses the first clip/cull slot, the high nibble the next one.
493 uint32_t vertexClipCullSlot
= state
.backendState
.vertexClipCullOffset
;
494 if (state
.backendState
.clipDistanceMask
& 0xf)
496 pa
.Assemble(vertexClipCullSlot
, tmpVector
);
497 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
499 vertices
[i
].attrib
[vertexClipCullSlot
] = tmpVector
[i
];
503 if (state
.backendState
.clipDistanceMask
& 0xf0)
505 pa
.Assemble(vertexClipCullSlot
+ 1, tmpVector
);
506 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
508 vertices
[i
].attrib
[vertexClipCullSlot
+ 1] = tmpVector
[i
];
512 uint32_t numAttribs
= maxSlot
+ 1;
// ClipPrims receives the vertex store as a float pointer and returns the per-lane clipped vertex count.
514 typename
SIMD_T::Integer vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
516 BinnerChooser
<SIMD_T
> binner(NumVertsPerPrim
, pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
518 // set up new PA for binning clipped primitives
519 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
520 if (NumVertsPerPrim
== 3)
522 clipTopology
= TOP_TRIANGLE_FAN
;
524 // so that the binner knows to bloat wide points later
525 if (pa
.binTopology
== TOP_POINT_LIST
)
527 clipTopology
= TOP_POINT_LIST
;
530 else if (NumVertsPerPrim
== 2)
532 clipTopology
= TOP_LINE_LIST
;
536 SWR_ASSERT(0 && "Unexpected points in clipper.");
// Scalar views of the SIMD registers so individual lanes can be indexed by inputPrim.
539 const uint32_t *pVertexCount
= reinterpret_cast<const uint32_t *>(&vNumClippedVerts
);
540 const uint32_t *pPrimitiveId
= reinterpret_cast<const uint32_t *>(&vPrimId
);
// Byte offsets of vertices[0] through vertices[6], used with a scale-1 gather below.
542 const SIMD256::Integer vOffsets
= SIMD256::set_epi32(
543 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>), // unused lane
544 6 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
545 5 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
546 4 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
547 3 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
548 2 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
549 1 * sizeof(SIMDVERTEX_T
<SIMD_T
>),
550 0 * sizeof(SIMDVERTEX_T
<SIMD_T
>));
552 // only need to gather 7 verts
553 // @todo dynamic mask based on actual # of verts generated per lane
554 const SIMD256::Float vMask
= SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
556 uint32_t numClippedPrims
= 0;
558 // transpose clipper output so that each lane's vertices are in SIMD order
559 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// NOTE(review): two declarations of transposedPrims follow (heap and stack); the
// preprocessor guard selecting between them is not visible in this listing.
563 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
564 SIMDVERTEX_T
<SIMD_T
> *transposedPrims
= reinterpret_cast<SIMDVERTEX_T
<SIMD_T
> *>(AlignedMalloc(sizeof(SIMDVERTEX_T
<SIMD_T
>) * 2, 64));
567 SIMDVERTEX_T
<SIMD_T
> transposedPrims
[2];
570 for (uint32_t inputPrim
= 0; inputPrim
< pa
.NumPrims(); ++inputPrim
)
572 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
573 if (numEmittedVerts
< NumVertsPerPrim
)
577 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
579 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
580 SWR_ASSERT(numEmittedPrims
<= 7, "Unexpected primitive count from clipper.");
582 numClippedPrims
+= numEmittedPrims
;
584 // transpose clipper output so that each lane's vertices are in SIMD order
585 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
// pBase points at lane 'inputPrim' of component 0 of the position slot in vertices[0].
// Each gather below reads that lane from all 7 vertices; pBase then steps one component.
589 uint8_t *pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
592 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
593 static const float *dummy
= reinterpret_cast<const float *>(pBase
);
596 for (uint32_t c
= 0; c
< 4; ++c
)
598 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
599 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
600 pBase
+= sizeof(typename
SIMD_T::Float
);
// Same transposition for the numAttribs attribute slots starting at vertexAttribOffset.
604 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[backendState
.vertexAttribOffset
]) + sizeof(float) * inputPrim
;
606 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
608 uint32_t attribSlot
= backendState
.vertexAttribOffset
+ attrib
;
610 for (uint32_t c
= 0; c
< 4; ++c
)
612 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
613 transposedPrims
[0].attrib
[attribSlot
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
614 pBase
+= sizeof(typename
SIMD_T::Float
);
618 // transpose user clip distances if enabled
619 uint32_t vertexClipCullSlot
= backendState
.vertexClipCullOffset
;
620 if (state
.backendState
.clipDistanceMask
& 0x0f)
622 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[vertexClipCullSlot
]) + sizeof(float) * inputPrim
;
624 for (uint32_t c
= 0; c
< 4; ++c
)
626 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
627 transposedPrims
[0].attrib
[vertexClipCullSlot
][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
628 pBase
+= sizeof(typename
SIMD_T::Float
);
632 if (state
.backendState
.clipDistanceMask
& 0xf0)
634 pBase
= reinterpret_cast<uint8_t *>(&vertices
[0].attrib
[vertexClipCullSlot
+ 1]) + sizeof(float) * inputPrim
;
636 for (uint32_t c
= 0; c
< 4; ++c
)
638 SIMD256::Float temp
= SIMD256::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase
), vOffsets
, vMask
);
639 transposedPrims
[0].attrib
[vertexClipCullSlot
+ 1][c
] = SimdHelper
<SIMD_T
>::insert_lo_ps(temp
);
640 pBase
+= sizeof(typename
SIMD_T::Float
);
// Wrap the transposed vertices in a PA so the binner can consume them.
644 PA_STATE_OPT
clipPA(pDC
, numEmittedPrims
, reinterpret_cast<uint8_t *>(&transposedPrims
[0]), numEmittedVerts
, SWR_VTX_NUM_SLOTS
, true, NumVertsPerPrim
, clipTopology
);
// primMaskMap[n] has the low n bits set: one bit per emitted primitive.
646 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
648 const uint32_t primMask
= primMaskMap
[numEmittedPrims
];
// Every primitive emitted for this lane gets the input primitive's id.
650 const typename
SIMD_T::Integer primID
= SIMD_T::set1_epi32(pPrimitiveId
[inputPrim
]);
652 while (clipPA
.GetNextStreamOutput())
656 typename
SIMD_T::Vec4 attrib
[NumVertsPerPrim
];
658 bool assemble
= clipPA
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
662 binner
.pfnBinFunc(pDC
, clipPA
, workerId
, attrib
, primMask
, primID
);
665 } while (clipPA
.NextPrim());
// Releases the AlignedMalloc'ed transposedPrims buffer declared above.
670 AlignedFree(transposedPrims
);
673 // update global pipeline stat
674 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
// Clipper entry point for one SIMD batch of primitives.
// It computes clip codes, removes lanes with NaN coordinates and user-culled lanes
// from primMask, then either runs ClipSimd or forwards valid primitives to the binner.
// prim holds the per-vertex positions, primMask the active lanes, primId the per-lane ids.
677 void ExecuteStage(PA_STATE
&pa
, typename
SIMD_T::Vec4 prim
[], uint32_t primMask
, typename
SIMD_T::Integer
const &primId
)
679 SWR_ASSERT(pa
.pDC
!= nullptr);
// pContext is not referenced by name below; presumably the AR_BEGIN/AR_END or
// UPDATE_STAT_FE macros use it -- TODO confirm.
681 SWR_CONTEXT
*pContext
= pa
.pDC
->pContext
;
683 BinnerChooser
<SIMD_T
> binner(pa
.binTopology
, pa
.pDC
->pState
->state
.rastState
.conservativeRast
);
685 // update clipper invocations pipeline stat
686 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
687 UPDATE_STAT_FE(CInvocations
, numInvoc
);
689 // Read back viewport index if required
690 typename
SIMD_T::Integer viewportIdx
= SIMD_T::setzero_si();
691 typename
SIMD_T::Vec4 vpiAttrib
[NumVertsPerPrim
];
692 typename
SIMD_T::Integer vpai
= SIMD_T::setzero_si();
694 if (state
.backendState
.readViewportArrayIndex
)
696 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
// The index is taken from vertex 0 only and reinterpreted from float bits to integer.
698 vpai
= SIMD_T::castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
702 if (state
.backendState
.readViewportArrayIndex
) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
704 // OOB indices => forced to zero.
// Negative indices are clamped to 0 by the max; indices >= KNOB_NUM_VIEWPORTS_SCISSORS
// are zeroed by AND-ing with the compare mask.
705 vpai
= SIMD_T::max_epi32(vpai
, SIMD_T::setzero_si());
706 typename
SIMD_T::Integer vNumViewports
= SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
707 typename
SIMD_T::Integer vClearMask
= SIMD_T::cmplt_epi32(vpai
, vNumViewports
);
708 viewportIdx
= SIMD_T::and_si(vClearMask
, vpai
);
711 ComputeClipCodes(prim
, viewportIdx
);
713 // cull prims with NAN coords
714 primMask
&= ~ComputeNaNMask(prim
);
716 // user cull distance cull
717 if (state
.backendState
.cullDistanceMask
)
719 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
722 // cull prims outside view frustum
// A lane stays valid when no clip code bit is shared by all of its vertices.
723 typename
SIMD_T::Float clipIntersection
= ComputeClipCodeIntersection();
724 int validMask
= primMask
& SimdHelper
<SIMD_T
>::cmpeq_ps_mask(clipIntersection
, SIMD_T::setzero_ps());
726 // skip clipping for points
727 uint32_t clipMask
= 0;
728 if (NumVertsPerPrim
!= 1)
730 clipMask
= primMask
& ComputeClipMask();
// NOTE(review): the conditions guarding the two paths below are not visible in this
// listing; presumably ClipSimd runs when clipMask is non-zero and the direct binner
// path runs otherwise -- confirm.
735 AR_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
736 // we have to clip tris, execute the clipper, which will also call the binner
738 ClipSimd(SIMD_T::vmask_ps(primMask
), SIMD_T::vmask_ps(clipMask
), pa
, primId
);
739 AR_END(FEGuardbandClip
, 1);
743 // update CPrimitives pipeline state
744 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
746 // forward valid prims directly to binner
747 binner
.pfnBinFunc(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
);
752 typename
SIMD_T::Float
ComputeInterpFactor(typename
SIMD_T::Float
const &boundaryCoord0
, typename
SIMD_T::Float
const &boundaryCoord1
)
754 return SIMD_T::div_ps(boundaryCoord0
, SIMD_T::sub_ps(boundaryCoord0
, boundaryCoord1
));
// Computes per-lane byte offsets into an array of SIMDVERTEX_T<SIMD_T>:
//   vIndices * sizeof(SIMDVERTEX_T) + attrib * sizeof(Vec4) + component * sizeof(Float) + elemOffset[lane]
// NOTE(review): the contents of elemOffset and the return statement are not visible in
// this listing; presumably the table holds each lane's byte offset within a SIMD
// register and vOffsets is returned -- confirm.
757 typename
SIMD_T::Integer
ComputeOffsets(uint32_t attrib
, typename
SIMD_T::Integer
const &vIndices
, uint32_t component
)
759 const uint32_t simdVertexStride
= sizeof(SIMDVERTEX_T
<SIMD_T
>);
760 const uint32_t componentStride
= sizeof(typename
SIMD_T::Float
);
761 const uint32_t attribStride
= sizeof(typename
SIMD_T::Vec4
);
763 static const OSALIGNSIMD16(uint32_t) elemOffset
[16] =
// The table must cover one entry per lane of the widest SIMD integer type.
783 static_assert(sizeof(typename
SIMD_T::Integer
) <= sizeof(elemOffset
), "Clipper::ComputeOffsets, Increase number of element offsets.");
785 typename
SIMD_T::Integer vElemOffset
= SIMD_T::loadu_si(reinterpret_cast<const typename
SIMD_T::Integer
*>(elemOffset
));
787 // step to the simdvertex
788 typename
SIMD_T::Integer vOffsets
= SIMD_T::mullo_epi32(vIndices
, SIMD_T::set1_epi32(simdVertexStride
));
790 // step to the attribute and component
791 vOffsets
= SIMD_T::add_epi32(vOffsets
, SIMD_T::set1_epi32(attribStride
* attrib
+ componentStride
* component
));
// add the per-lane element offset
794 vOffsets
= SIMD_T::add_epi32(vOffsets
, vElemOffset
);
799 typename
SIMD_T::Float
GatherComponent(const float* pBuffer
, uint32_t attrib
, typename
SIMD_T::Float
const &vMask
, typename
SIMD_T::Integer
const &vIndices
, uint32_t component
)
801 typename
SIMD_T::Integer vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
802 typename
SIMD_T::Float vSrc
= SIMD_T::setzero_ps();
804 return SIMD_T::template mask_i32gather_ps
<typename
SIMD_T::ScaleFactor(1)>(vSrc
, pBuffer
, vOffsets
, vMask
);
// Writes one component of one attribute into a SIMD vertex buffer, lane by lane.
// For every lane set in vMask, the lane's value from vSrc is stored at
// pBuffer + ComputeOffsets(attrib, vIndices, component)[lane] (byte offset).
// NOTE(review): pBuffer is declared const float* but is written through a cast below.
807 void ScatterComponent(const float* pBuffer
, uint32_t attrib
, typename
SIMD_T::Float
const &vMask
, typename
SIMD_T::Integer
const &vIndices
, uint32_t component
, typename
SIMD_T::Float
const &vSrc
)
809 typename
SIMD_T::Integer vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
// Scalar views of the SIMD registers so single lanes can be indexed.
811 const uint32_t *pOffsets
= reinterpret_cast<const uint32_t *>(&vOffsets
);
812 const float *pSrc
= reinterpret_cast<const float *>(&vSrc
);
813 uint32_t mask
= SIMD_T::movemask_ps(vMask
);
// Visit each set bit of the mask, clearing it once handled.
815 while (_BitScanForward(&lane
, mask
))
817 mask
&= ~(1 << lane
);
818 const uint8_t *pBuf
= reinterpret_cast<const uint8_t *>(pBuffer
) + pOffsets
[lane
];
819 *(float *)pBuf
= pSrc
[lane
];
// Computes the intersection of edge (v1, v2) with ClippingPlane for the active lanes
// and writes the interpolated position, attributes and user clip distances to pOutVerts
// at vertex index outIndex. outIndex is only read in the visible code; ClipTriToPlane
// advances it after calling intersect.
// NOTE(review): the line carrying the function name and return type is not visible in
// this listing; ClipTriToPlane calls it as intersect<ClippingPlane>(...).
823 template<SWR_CLIPCODES ClippingPlane
>
825 const typename
SIMD_T::Float
&vActiveMask
, // active lanes to operate on
826 const typename
SIMD_T::Integer
&s
, // index to first edge vertex v0 in pInPts.
827 const typename
SIMD_T::Integer
&p
, // index to second edge vertex v1 in pInPts.
828 const typename
SIMD_T::Vec4
&v1
, // vertex 0 position
829 const typename
SIMD_T::Vec4
&v2
, // vertex 1 position
830 typename
SIMD_T::Integer
&outIndex
, // output index.
831 const float *pInVerts
, // array of all the input positions.
832 uint32_t numInAttribs
, // number of attributes per vertex.
833 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
835 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
836 uint32_t vertexClipCullOffset
= this->state
.backendState
.vertexClipCullOffset
;
838 // compute interpolation factor
// Each case passes both vertices' boundary coordinate for the plane to ComputeInterpFactor:
// w + coord for left/top/GL-near, w - coord for right/bottom/far, plain z for DX near.
839 typename
SIMD_T::Float t
;
840 switch (ClippingPlane
)
842 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[0]), SIMD_T::add_ps(v2
[3], v2
[0])); break;
843 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[0]), SIMD_T::sub_ps(v2
[3], v2
[0])); break;
844 case FRUSTUM_TOP
: t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[1]), SIMD_T::add_ps(v2
[3], v2
[1])); break;
845 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[1]), SIMD_T::sub_ps(v2
[3], v2
[1])); break;
// NOTE(review): the case label for the near plane is not visible in this listing.
847 // DX Znear plane is 0, GL is -w
848 if (this->state
.rastState
.clipHalfZ
)
850 t
= ComputeInterpFactor(v1
[2], v2
[2]);
854 t
= ComputeInterpFactor(SIMD_T::add_ps(v1
[3], v1
[2]), SIMD_T::add_ps(v2
[3], v2
[2]));
857 case FRUSTUM_FAR
: t
= ComputeInterpFactor(SIMD_T::sub_ps(v1
[3], v1
[2]), SIMD_T::sub_ps(v2
[3], v2
[2])); break;
858 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
861 // interpolate position and store
// out = v1 + (v2 - v1) * t, computed with a fused multiply-add
862 for (uint32_t c
= 0; c
< 4; ++c
)
864 typename
SIMD_T::Float vOutPos
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
865 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
868 // interpolate attributes and store
// Attribute values are gathered at vertex indices s and p and blended with the same t.
869 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
871 uint32_t attribSlot
= vertexAttribOffset
+ a
;
872 for (uint32_t c
= 0; c
< 4; ++c
)
874 typename
SIMD_T::Float vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
875 typename
SIMD_T::Float vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
876 typename
SIMD_T::Float vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
877 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
881 // interpolate clip distance if enabled
// The low nibble of clipDistanceMask covers the first clip/cull slot, the high nibble the next one.
882 if (this->state
.backendState
.clipDistanceMask
& 0xf)
884 uint32_t attribSlot
= vertexClipCullOffset
;
885 for (uint32_t c
= 0; c
< 4; ++c
)
887 typename
SIMD_T::Float vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
888 typename
SIMD_T::Float vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
889 typename
SIMD_T::Float vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
890 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
894 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
896 uint32_t attribSlot
= vertexClipCullOffset
+ 1;
897 for (uint32_t c
= 0; c
< 4; ++c
)
899 typename
SIMD_T::Float vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
900 typename
SIMD_T::Float vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
901 typename
SIMD_T::Float vOutAttrib
= SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
902 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
907 template<SWR_CLIPCODES ClippingPlane
>
908 typename
SIMD_T::Float
inside(const typename
SIMD_T::Vec4
&v
)
910 switch (ClippingPlane
)
912 case FRUSTUM_LEFT
: return SIMD_T::cmpge_ps(v
[0], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
913 case FRUSTUM_RIGHT
: return SIMD_T::cmple_ps(v
[0], v
[3]);
914 case FRUSTUM_TOP
: return SIMD_T::cmpge_ps(v
[1], SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
915 case FRUSTUM_BOTTOM
: return SIMD_T::cmple_ps(v
[1], v
[3]);
916 case FRUSTUM_NEAR
: return SIMD_T::cmpge_ps(v
[2], this->state
.rastState
.clipHalfZ
? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v
[3], SIMD_T::set1_ps(-1.0f
)));
917 case FRUSTUM_FAR
: return SIMD_T::cmple_ps(v
[2], v
[3]);
919 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
920 return SIMD_T::setzero_ps();
924 template<SWR_CLIPCODES ClippingPlane
>
925 typename
SIMD_T::Integer
ClipTriToPlane(const float *pInVerts
, const typename
SIMD_T::Integer
&vNumInPts
, uint32_t numInAttribs
, float *pOutVerts
)
927 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
929 typename
SIMD_T::Integer vCurIndex
= SIMD_T::setzero_si();
930 typename
SIMD_T::Integer vOutIndex
= SIMD_T::setzero_si();
931 typename
SIMD_T::Float vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
933 while (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
935 typename
SIMD_T::Integer s
= vCurIndex
;
936 typename
SIMD_T::Integer p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
937 typename
SIMD_T::Integer underFlowMask
= SIMD_T::cmpgt_epi32(vNumInPts
, p
);
938 p
= SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p
), SIMD_T::castsi_ps(underFlowMask
)));
941 typename
SIMD_T::Vec4 vInPos0
, vInPos1
;
942 for (uint32_t c
= 0; c
< 4; ++c
)
944 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
945 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
948 // compute inside mask
949 typename
SIMD_T::Float s_in
= inside
<ClippingPlane
>(vInPos0
);
950 typename
SIMD_T::Float p_in
= inside
<ClippingPlane
>(vInPos1
);
952 // compute intersection mask (s_in != p_in)
953 typename
SIMD_T::Float intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
954 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
957 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
958 if (!SIMD_T::testz_ps(s_in
, s_in
))
961 for (uint32_t c
= 0; c
< 4; ++c
)
963 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
967 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
969 uint32_t attribSlot
= vertexAttribOffset
+ a
;
970 for (uint32_t c
= 0; c
< 4; ++c
)
972 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
973 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
977 // store clip distance if enabled
978 uint32_t vertexClipCullSlot
= this->state
.backendState
.vertexClipCullOffset
;
979 if (this->state
.backendState
.clipDistanceMask
& 0xf)
981 uint32_t attribSlot
= vertexClipCullSlot
;
982 for (uint32_t c
= 0; c
< 4; ++c
)
984 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
985 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
989 if (this->state
.backendState
.clipDistanceMask
& 0xf0)
991 uint32_t attribSlot
= vertexClipCullSlot
+ 1;
992 for (uint32_t c
= 0; c
< 4; ++c
)
994 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
995 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
999 // increment outIndex
1000 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1003 // compute and store intersection
1004 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1006 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1008 // increment outIndex for active lanes
1009 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1012 // increment loop index and update active mask
1013 vCurIndex
= SIMD_T::add_epi32(vCurIndex
, SIMD_T::set1_epi32(1));
1014 vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1020 template<SWR_CLIPCODES ClippingPlane
>
1021 typename
SIMD_T::Integer
ClipLineToPlane(const float *pInVerts
, const typename
SIMD_T::Integer
&vNumInPts
, uint32_t numInAttribs
, float *pOutVerts
)
1023 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1025 typename
SIMD_T::Integer vCurIndex
= SIMD_T::setzero_si();
1026 typename
SIMD_T::Integer vOutIndex
= SIMD_T::setzero_si();
1027 typename
SIMD_T::Float vActiveMask
= SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex
, vNumInPts
));
1029 if (!SIMD_T::testz_ps(vActiveMask
, vActiveMask
))
1031 typename
SIMD_T::Integer s
= vCurIndex
;
1032 typename
SIMD_T::Integer p
= SIMD_T::add_epi32(s
, SIMD_T::set1_epi32(1));
1035 typename
SIMD_T::Vec4 vInPos0
, vInPos1
;
1036 for (uint32_t c
= 0; c
< 4; ++c
)
1038 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1039 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1042 // compute inside mask
1043 typename
SIMD_T::Float s_in
= inside
<ClippingPlane
>(vInPos0
);
1044 typename
SIMD_T::Float p_in
= inside
<ClippingPlane
>(vInPos1
);
1046 // compute intersection mask (s_in != p_in)
1047 typename
SIMD_T::Float intersectMask
= SIMD_T::xor_ps(s_in
, p_in
);
1048 intersectMask
= SIMD_T::and_ps(intersectMask
, vActiveMask
);
1050 // store s if inside
1051 s_in
= SIMD_T::and_ps(s_in
, vActiveMask
);
1052 if (!SIMD_T::testz_ps(s_in
, s_in
))
1054 for (uint32_t c
= 0; c
< 4; ++c
)
1056 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
1059 // interpolate attributes and store
1060 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1062 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1063 for (uint32_t c
= 0; c
< 4; ++c
)
1065 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1066 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1070 // increment outIndex
1071 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), s_in
);
1074 // compute and store intersection
1075 if (!SIMD_T::testz_ps(intersectMask
, intersectMask
))
1077 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1079 // increment outIndex for active lanes
1080 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), intersectMask
);
1083 // store p if inside
1084 p_in
= SIMD_T::and_ps(p_in
, vActiveMask
);
1085 if (!SIMD_T::testz_ps(p_in
, p_in
))
1087 for (uint32_t c
= 0; c
< 4; ++c
)
1089 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
1092 // interpolate attributes and store
1093 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1095 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1096 for (uint32_t c
= 0; c
< 4; ++c
)
1098 typename
SIMD_T::Float vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
1099 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
1103 // increment outIndex
1104 vOutIndex
= SIMD_T::blendv_epi32(vOutIndex
, SIMD_T::add_epi32(vOutIndex
, SIMD_T::set1_epi32(1)), p_in
);
1111 typename
SIMD_T::Integer
ClipPrims(float *pVertices
, const typename
SIMD_T::Float
&vPrimMask
, const typename
SIMD_T::Float
&vClipMask
, int numAttribs
)
1114 float *pTempVerts
= reinterpret_cast<float *>(ClipHelper
<SIMD_T
>::GetTempVertices());
1116 // zero out num input verts for non-active lanes
1117 typename
SIMD_T::Integer vNumInPts
= SIMD_T::set1_epi32(NumVertsPerPrim
);
1118 vNumInPts
= SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts
, vClipMask
);
1120 // clip prims to frustum
1121 typename
SIMD_T::Integer vNumOutPts
;
1122 if (NumVertsPerPrim
== 3)
1124 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1125 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1126 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1127 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1128 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1129 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1133 SWR_ASSERT(NumVertsPerPrim
== 2);
1134 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1135 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1136 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1137 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1138 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1139 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1142 // restore num verts for non-clipped, active lanes
1143 typename
SIMD_T::Float vNonClippedMask
= SIMD_T::andnot_ps(vClipMask
, vPrimMask
);
1144 vNumOutPts
= SIMD_T::blendv_epi32(vNumOutPts
, SIMD_T::set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
1149 const uint32_t workerId
{ 0 };
1150 DRAW_CONTEXT
*pDC
{ nullptr };
1151 const API_STATE
&state
;
1152 typename
SIMD_T::Float clipCodes
[NumVertsPerPrim
];
1156 // pipeline stage functions
1157 void ClipTriangles(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
);
1158 void ClipLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
);
1159 void ClipPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari
const &primId
);
1160 #if USE_SIMD16_FRONTEND
1161 void SIMDCALL
ClipTriangles_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
);
1162 void SIMDCALL
ClipLines_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
);
1163 void SIMDCALL
ClipPoints_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[], uint32_t primMask
, simd16scalari
const &primId
);