1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices
[7];
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
// rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
enum SWR_CLIPCODES
{
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    NEGW             = (0x40 << CLIPCODE_SHIFT),

    // Guardband codes share one high bit; the low 4 bits distinguish the edge.
    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// Codes that require a primitive to be run through the guardband clipper.
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
62 void ComputeClipCodes(const API_STATE
& state
, const simdvector
& vertex
, simdscalar
& clipCodes
, simdscalari viewportIndexes
)
64 clipCodes
= _simd_setzero_ps();
67 simdscalar vNegW
= _simd_mul_ps(vertex
.w
, _simd_set1_ps(-1.0f
));
70 simdscalar vRes
= _simd_cmplt_ps(vertex
.x
, vNegW
);
71 clipCodes
= _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT
)));
74 vRes
= _simd_cmplt_ps(vertex
.y
, vNegW
);
75 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP
))));
78 vRes
= _simd_cmpgt_ps(vertex
.x
, vertex
.w
);
79 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT
))));
82 vRes
= _simd_cmpgt_ps(vertex
.y
, vertex
.w
);
83 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM
))));
85 if (state
.rastState
.depthClipEnable
)
88 // DX clips depth [0..w], GL clips [-w..w]
89 if (state
.rastState
.clipHalfZ
)
91 vRes
= _simd_cmplt_ps(vertex
.z
, _simd_setzero_ps());
95 vRes
= _simd_cmplt_ps(vertex
.z
, vNegW
);
97 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR
))));
100 vRes
= _simd_cmpgt_ps(vertex
.z
, vertex
.w
);
101 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR
))));
105 vRes
= _simd_cmple_ps(vertex
.w
, _simd_setzero_ps());
106 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(NEGW
))));
109 simdscalar gbMult
= _simd_mul_ps(vNegW
, _simd_i32gather_ps(&state
.gbState
.left
[0], viewportIndexes
, 4));
110 vRes
= _simd_cmplt_ps(vertex
.x
, gbMult
);
111 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT
))));
114 gbMult
= _simd_mul_ps(vNegW
, _simd_i32gather_ps(&state
.gbState
.top
[0], viewportIndexes
, 4));
115 vRes
= _simd_cmplt_ps(vertex
.y
, gbMult
);
116 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP
))));
119 gbMult
= _simd_mul_ps(vertex
.w
, _simd_i32gather_ps(&state
.gbState
.right
[0], viewportIndexes
, 4));
120 vRes
= _simd_cmpgt_ps(vertex
.x
, gbMult
);
121 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT
))));
124 gbMult
= _simd_mul_ps(vertex
.w
, _simd_i32gather_ps(&state
.gbState
.bottom
[0], viewportIndexes
, 4));
125 vRes
= _simd_cmpgt_ps(vertex
.y
, gbMult
);
126 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM
))));
129 template<uint32_t NumVertsPerPrim
>
133 Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
134 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
136 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
139 void ComputeClipCodes(simdvector vertex
[], simdscalari viewportIndexes
)
141 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
143 ::ComputeClipCodes(this->state
, vertex
[i
], this->clipCodes
[i
], viewportIndexes
);
147 simdscalar
ComputeClipCodeIntersection()
149 simdscalar result
= this->clipCodes
[0];
150 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
152 result
= _simd_and_ps(result
, this->clipCodes
[i
]);
157 simdscalar
ComputeClipCodeUnion()
159 simdscalar result
= this->clipCodes
[0];
160 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
162 result
= _simd_or_ps(result
, this->clipCodes
[i
]);
167 int ComputeNegWMask()
169 simdscalar clipCodeUnion
= ComputeClipCodeUnion();
170 clipCodeUnion
= _simd_and_ps(clipCodeUnion
, _simd_castsi_ps(_simd_set1_epi32(NEGW
)));
171 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion
, _simd_setzero_ps()));
174 int ComputeClipMask()
176 simdscalar clipUnion
= ComputeClipCodeUnion();
177 clipUnion
= _simd_and_ps(clipUnion
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK
)));
178 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion
, _simd_setzero_ps()));
181 // clipper is responsible for culling any prims with NAN coordinates
182 int ComputeNaNMask(simdvector prim
[])
184 simdscalar vNanMask
= _simd_setzero_ps();
185 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
187 simdscalar vNan01
= _simd_cmp_ps(prim
[e
].v
[0], prim
[e
].v
[1], _CMP_UNORD_Q
);
188 vNanMask
= _simd_or_ps(vNanMask
, vNan01
);
189 simdscalar vNan23
= _simd_cmp_ps(prim
[e
].v
[2], prim
[e
].v
[3], _CMP_UNORD_Q
);
190 vNanMask
= _simd_or_ps(vNanMask
, vNan23
);
193 return _simd_movemask_ps(vNanMask
);
196 int ComputeUserClipCullMask(PA_STATE
& pa
, simdvector prim
[])
198 uint8_t cullMask
= this->state
.rastState
.cullDistanceMask
;
199 simdscalar vClipCullMask
= _simd_setzero_ps();
202 simdvector vClipCullDistLo
[3];
203 simdvector vClipCullDistHi
[3];
205 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, vClipCullDistLo
);
206 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, vClipCullDistHi
);
207 while (_BitScanForward(&index
, cullMask
))
209 cullMask
&= ~(1 << index
);
210 uint32_t slot
= index
>> 2;
211 uint32_t component
= index
& 0x3;
213 simdscalar vCullMaskElem
= _simd_set1_ps(-1.0f
);
214 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
216 simdscalar vCullComp
;
219 vCullComp
= vClipCullDistLo
[e
][component
];
223 vCullComp
= vClipCullDistHi
[e
][component
];
226 // cull if cull distance < 0 || NAN
227 simdscalar vCull
= _simd_cmp_ps(_mm256_setzero_ps(), vCullComp
, _CMP_NLE_UQ
);
228 vCullMaskElem
= _simd_and_ps(vCullMaskElem
, vCull
);
230 vClipCullMask
= _simd_or_ps(vClipCullMask
, vCullMaskElem
);
233 // clipper should also discard any primitive with NAN clip distance
234 uint8_t clipMask
= this->state
.rastState
.clipDistanceMask
;
235 while (_BitScanForward(&index
, clipMask
))
237 clipMask
&= ~(1 << index
);
238 uint32_t slot
= index
>> 2;
239 uint32_t component
= index
& 0x3;
241 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
243 simdscalar vClipComp
;
246 vClipComp
= vClipCullDistLo
[e
][component
];
250 vClipComp
= vClipCullDistHi
[e
][component
];
253 simdscalar vClip
= _simd_cmp_ps(vClipComp
, vClipComp
, _CMP_UNORD_Q
);
254 vClipCullMask
= _simd_or_ps(vClipCullMask
, vClip
);
258 return _simd_movemask_ps(vClipCullMask
);
261 // clip SIMD primitives
262 void ClipSimd(const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, PA_STATE
& pa
, const simdscalari
& vPrimId
, const simdscalari
& vViewportIdx
)
264 // input/output vertex store for clipper
265 simdvertex vertices
[7]; // maximum 7 verts generated per triangle
267 LONG constantInterpMask
= this->state
.backendState
.constantInterpolationMask
;
268 uint32_t provokingVertex
= 0;
269 if(pa
.binTopology
== TOP_TRIANGLE_FAN
)
271 provokingVertex
= this->state
.frontendState
.provokingVertex
.triFan
;
273 ///@todo: line topology for wireframe?
276 simdvector tmpVector
[NumVertsPerPrim
];
277 pa
.Assemble(VERTEX_POSITION_SLOT
, tmpVector
);
278 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
280 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = tmpVector
[i
];
284 const SWR_BACKEND_STATE
& backendState
= this->state
.backendState
;
286 int32_t maxSlot
= -1;
287 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
289 // Compute absolute attrib slot in vertex array
290 uint32_t mapSlot
= backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
291 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
292 uint32_t inputSlot
= VERTEX_ATTRIB_START_SLOT
+ mapSlot
;
294 pa
.Assemble(inputSlot
, tmpVector
);
296 // if constant interpolation enabled for this attribute, assign the provoking
297 // vertex values to all edges
298 if (_bittest(&constantInterpMask
, slot
))
300 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
302 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
307 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
309 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
314 // assemble user clip distances if enabled
315 if (this->state
.rastState
.clipDistanceMask
& 0xf)
317 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, tmpVector
);
318 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
320 vertices
[i
].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
] = tmpVector
[i
];
324 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
326 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, tmpVector
);
327 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
329 vertices
[i
].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
] = tmpVector
[i
];
333 uint32_t numAttribs
= maxSlot
+ 1;
335 simdscalari vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
337 // set up new PA for binning clipped primitives
338 PFN_PROCESS_PRIMS pfnBinFunc
= nullptr;
339 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
340 if (NumVertsPerPrim
== 3)
342 pfnBinFunc
= GetBinTrianglesFunc((pa
.pDC
->pState
->state
.rastState
.conservativeRast
> 0));
343 clipTopology
= TOP_TRIANGLE_FAN
;
345 // so that the binner knows to bloat wide points later
346 if (pa
.binTopology
== TOP_POINT_LIST
)
347 clipTopology
= TOP_POINT_LIST
;
350 else if (NumVertsPerPrim
== 2)
352 pfnBinFunc
= BinLines
;
353 clipTopology
= TOP_LINE_LIST
;
357 SWR_ASSERT(0 && "Unexpected points in clipper.");
360 uint32_t* pVertexCount
= (uint32_t*)&vNumClippedVerts
;
361 uint32_t* pPrimitiveId
= (uint32_t*)&vPrimId
;
362 uint32_t* pViewportIdx
= (uint32_t*)&vViewportIdx
;
364 const simdscalari vOffsets
= _mm256_set_epi32(
365 0 * sizeof(simdvertex
), // unused lane
366 6 * sizeof(simdvertex
),
367 5 * sizeof(simdvertex
),
368 4 * sizeof(simdvertex
),
369 3 * sizeof(simdvertex
),
370 2 * sizeof(simdvertex
),
371 1 * sizeof(simdvertex
),
372 0 * sizeof(simdvertex
));
374 // only need to gather 7 verts
375 // @todo dynamic mask based on actual # of verts generated per lane
376 const simdscalar vMask
= _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
378 uint32_t numClippedPrims
= 0;
379 #if USE_SIMD16_FRONTEND
380 const uint32_t numPrims
= pa
.NumPrims();
381 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
383 SWR_ASSERT(numPrims
<= numPrims_lo
);
385 for (uint32_t inputPrim
= 0; inputPrim
< numPrims_lo
; ++inputPrim
)
387 for (uint32_t inputPrim
= 0; inputPrim
< pa
.NumPrims(); ++inputPrim
)
390 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
391 if (numEmittedVerts
< NumVertsPerPrim
)
395 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
397 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
398 numClippedPrims
+= numEmittedPrims
;
400 // tranpose clipper output so that each lane's vertices are in SIMD order
401 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
403 #if USE_SIMD16_FRONTEND
404 simd16vertex transposedPrims
[2];
406 simdvertex transposedPrims
[2];
410 uint8_t* pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
412 #if USE_SIMD16_FRONTEND
413 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug
414 static const float *dummy
= reinterpret_cast<const float *>(pBase
);
417 for (uint32_t c
= 0; c
< 4; ++c
)
419 #if USE_SIMD16_FRONTEND
420 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
421 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
423 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
425 pBase
+= sizeof(simdscalar
);
429 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_ATTRIB_START_SLOT
]) + sizeof(float) * inputPrim
;
430 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
432 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ attrib
;
433 for (uint32_t c
= 0; c
< 4; ++c
)
435 #if USE_SIMD16_FRONTEND
436 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
437 transposedPrims
[0].attrib
[attribSlot
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
439 transposedPrims
[0].attrib
[attribSlot
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
441 pBase
+= sizeof(simdscalar
);
445 // transpose user clip distances if enabled
446 if (this->state
.rastState
.clipDistanceMask
& 0xf)
448 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
]) + sizeof(float) * inputPrim
;
449 for (uint32_t c
= 0; c
< 4; ++c
)
451 #if USE_SIMD16_FRONTEND
452 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
453 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
455 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
457 pBase
+= sizeof(simdscalar
);
461 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
463 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
]) + sizeof(float) * inputPrim
;
464 for (uint32_t c
= 0; c
< 4; ++c
)
466 #if USE_SIMD16_FRONTEND
467 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
468 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
470 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
472 pBase
+= sizeof(simdscalar
);
476 PA_STATE_OPT
clipPa(this->pDC
, numEmittedPrims
, (uint8_t*)&transposedPrims
[0], numEmittedVerts
, true, clipTopology
);
478 while (clipPa
.GetNextStreamOutput())
482 #if USE_SIMD16_FRONTEND
483 simd16vector attrib_simd16
[NumVertsPerPrim
];
484 bool assemble
= clipPa
.Assemble_simd16(VERTEX_POSITION_SLOT
, attrib_simd16
);
488 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
490 simdvector attrib
[NumVertsPerPrim
];
491 for (uint32_t i
= 0; i
< NumVertsPerPrim
; i
+= 1)
493 for (uint32_t j
= 0; j
< 4; j
+= 1)
495 attrib
[i
][j
] = _simd16_extract_ps(attrib_simd16
[i
][j
], 0);
499 clipPa
.useAlternateOffset
= false;
500 pfnBinFunc(this->pDC
, clipPa
, this->workerId
, attrib
, primMaskMap
[numEmittedPrims
], _simd_set1_epi32(pPrimitiveId
[inputPrim
]), _simd_set1_epi32(pViewportIdx
[inputPrim
]));
503 simdvector attrib
[NumVertsPerPrim
];
504 bool assemble
= clipPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
507 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
508 pfnBinFunc(this->pDC
, clipPa
, this->workerId
, attrib
, primMaskMap
[numEmittedPrims
], _simd_set1_epi32(pPrimitiveId
[inputPrim
]), _simd_set1_epi32(pViewportIdx
[inputPrim
]));
511 } while (clipPa
.NextPrim());
515 // update global pipeline stat
516 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
519 // execute the clipper stage
520 void ExecuteStage(PA_STATE
& pa
, simdvector prim
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
)
522 SWR_ASSERT(pa
.pDC
!= nullptr);
523 SWR_CONTEXT
* pContext
= pa
.pDC
->pContext
;
525 // set up binner based on PA state
526 PFN_PROCESS_PRIMS pfnBinner
;
527 switch (pa
.binTopology
)
530 pfnBinner
= BinPoints
;
535 case TOP_LINE_LIST_ADJ
:
536 case TOP_LISTSTRIP_ADJ
:
537 pfnBinner
= BinLines
;
540 pfnBinner
= GetBinTrianglesFunc((pa
.pDC
->pState
->state
.rastState
.conservativeRast
> 0));
544 // update clipper invocations pipeline stat
545 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
546 UPDATE_STAT_FE(CInvocations
, numInvoc
);
548 ComputeClipCodes(prim
, viewportIdx
);
550 // cull prims with NAN coords
551 primMask
&= ~ComputeNaNMask(prim
);
553 // user cull distance cull
554 if (this->state
.rastState
.cullDistanceMask
)
556 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
559 // cull prims outside view frustum
560 simdscalar clipIntersection
= ComputeClipCodeIntersection();
561 int validMask
= primMask
& _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection
, _simd_setzero_ps()));
563 // skip clipping for points
564 uint32_t clipMask
= 0;
565 if (NumVertsPerPrim
!= 1)
567 clipMask
= primMask
& ComputeClipMask();
572 AR_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
573 // we have to clip tris, execute the clipper, which will also
575 ClipSimd(vMask(primMask
), vMask(clipMask
), pa
, primId
, viewportIdx
);
576 AR_END(FEGuardbandClip
, 1);
580 // update CPrimitives pipeline state
581 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
583 // forward valid prims directly to binner
584 pfnBinner(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
, viewportIdx
);
589 inline simdscalar
ComputeInterpFactor(simdscalar boundaryCoord0
, simdscalar boundaryCoord1
)
591 return _simd_div_ps(boundaryCoord0
, _simd_sub_ps(boundaryCoord0
, boundaryCoord1
));
594 inline simdscalari
ComputeOffsets(uint32_t attrib
, simdscalari vIndices
, uint32_t component
)
596 const uint32_t simdVertexStride
= sizeof(simdvertex
);
597 const uint32_t componentStride
= sizeof(simdscalar
);
598 const uint32_t attribStride
= sizeof(simdvector
);
599 const __m256i vElemOffset
= _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
600 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
602 // step to the simdvertex
603 simdscalari vOffsets
= _simd_mullo_epi32(vIndices
, _simd_set1_epi32(simdVertexStride
));
605 // step to the attribute and component
606 vOffsets
= _simd_add_epi32(vOffsets
, _simd_set1_epi32(attribStride
* attrib
+ componentStride
* component
));
609 vOffsets
= _simd_add_epi32(vOffsets
, vElemOffset
);
614 // gathers a single component for a given attribute for each SIMD lane
615 inline simdscalar
GatherComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
)
617 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
618 simdscalar vSrc
= _mm256_undefined_ps();
619 return _simd_mask_i32gather_ps(vSrc
, pBuffer
, vOffsets
, vMask
, 1);
622 inline void ScatterComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
, simdscalar vSrc
)
624 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
626 uint32_t* pOffsets
= (uint32_t*)&vOffsets
;
627 float* pSrc
= (float*)&vSrc
;
628 uint32_t mask
= _simd_movemask_ps(vMask
);
630 while (_BitScanForward(&lane
, mask
))
632 mask
&= ~(1 << lane
);
633 uint8_t* pBuf
= (uint8_t*)pBuffer
+ pOffsets
[lane
];
634 *(float*)pBuf
= pSrc
[lane
];
638 template<SWR_CLIPCODES ClippingPlane
>
639 inline void intersect(
640 const simdscalar
& vActiveMask
, // active lanes to operate on
641 const simdscalari
& s
, // index to first edge vertex v0 in pInPts.
642 const simdscalari
& p
, // index to second edge vertex v1 in pInPts.
643 const simdvector
& v1
, // vertex 0 position
644 const simdvector
& v2
, // vertex 1 position
645 simdscalari
& outIndex
, // output index.
646 const float *pInVerts
, // array of all the input positions.
647 uint32_t numInAttribs
, // number of attributes per vertex.
648 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
650 // compute interpolation factor
652 switch (ClippingPlane
)
654 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[0]), _simd_add_ps(v2
[3], v2
[0])); break;
655 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[0]), _simd_sub_ps(v2
[3], v2
[0])); break;
656 case FRUSTUM_TOP
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[1]), _simd_add_ps(v2
[3], v2
[1])); break;
657 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[1]), _simd_sub_ps(v2
[3], v2
[1])); break;
659 // DX Znear plane is 0, GL is -w
660 if (this->state
.rastState
.clipHalfZ
)
662 t
= ComputeInterpFactor(v1
[2], v2
[2]);
666 t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[2]), _simd_add_ps(v2
[3], v2
[2]));
669 case FRUSTUM_FAR
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[2]), _simd_sub_ps(v2
[3], v2
[2])); break;
670 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
673 // interpolate position and store
674 for (uint32_t c
= 0; c
< 4; ++c
)
676 simdscalar vOutPos
= _simd_fmadd_ps(_simd_sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
677 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
680 // interpolate attributes and store
681 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
683 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
684 for (uint32_t c
= 0; c
< 4; ++c
)
686 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
687 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
688 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
689 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
693 // interpolate clip distance if enabled
694 if (this->state
.rastState
.clipDistanceMask
& 0xf)
696 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_LO_SLOT
;
697 for (uint32_t c
= 0; c
< 4; ++c
)
699 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
700 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
701 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
702 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
706 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
708 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_HI_SLOT
;
709 for (uint32_t c
= 0; c
< 4; ++c
)
711 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
712 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
713 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
714 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
719 template<SWR_CLIPCODES ClippingPlane
>
720 inline simdscalar
inside(const simdvector
& v
)
722 switch (ClippingPlane
)
724 case FRUSTUM_LEFT
: return _simd_cmpge_ps(v
[0], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
725 case FRUSTUM_RIGHT
: return _simd_cmple_ps(v
[0], v
[3]);
726 case FRUSTUM_TOP
: return _simd_cmpge_ps(v
[1], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
727 case FRUSTUM_BOTTOM
: return _simd_cmple_ps(v
[1], v
[3]);
728 case FRUSTUM_NEAR
: return _simd_cmpge_ps(v
[2], this->state
.rastState
.clipHalfZ
? _simd_setzero_ps() : _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
729 case FRUSTUM_FAR
: return _simd_cmple_ps(v
[2], v
[3]);
731 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
732 return _simd_setzero_ps();
736 template<SWR_CLIPCODES ClippingPlane
>
737 simdscalari
ClipTriToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
739 simdscalari vCurIndex
= _simd_setzero_si();
740 simdscalari vOutIndex
= _simd_setzero_si();
741 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
743 while (!_simd_testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
745 simdscalari s
= vCurIndex
;
746 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
747 simdscalari underFlowMask
= _simd_cmpgt_epi32(vNumInPts
, p
);
748 p
= _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p
), _simd_castsi_ps(underFlowMask
)));
751 simdvector vInPos0
, vInPos1
;
752 for (uint32_t c
= 0; c
< 4; ++c
)
754 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
755 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
758 // compute inside mask
759 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
760 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
762 // compute intersection mask (s_in != p_in)
763 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
764 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
767 s_in
= _simd_and_ps(s_in
, vActiveMask
);
768 if (!_simd_testz_ps(s_in
, s_in
))
771 for (uint32_t c
= 0; c
< 4; ++c
)
773 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
777 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
779 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
780 for (uint32_t c
= 0; c
< 4; ++c
)
782 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
783 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
787 // store clip distance if enabled
788 if (this->state
.rastState
.clipDistanceMask
& 0xf)
790 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_LO_SLOT
;
791 for (uint32_t c
= 0; c
< 4; ++c
)
793 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
794 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
798 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
800 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_HI_SLOT
;
801 for (uint32_t c
= 0; c
< 4; ++c
)
803 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
804 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
808 // increment outIndex
809 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
812 // compute and store intersection
813 if (!_simd_testz_ps(intersectMask
, intersectMask
))
815 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
817 // increment outIndex for active lanes
818 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
821 // increment loop index and update active mask
822 vCurIndex
= _simd_add_epi32(vCurIndex
, _simd_set1_epi32(1));
823 vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
829 template<SWR_CLIPCODES ClippingPlane
>
830 simdscalari
ClipLineToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
832 simdscalari vCurIndex
= _simd_setzero_si();
833 simdscalari vOutIndex
= _simd_setzero_si();
834 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
836 if (!_simd_testz_ps(vActiveMask
, vActiveMask
))
838 simdscalari s
= vCurIndex
;
839 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
842 simdvector vInPos0
, vInPos1
;
843 for (uint32_t c
= 0; c
< 4; ++c
)
845 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
846 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
849 // compute inside mask
850 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
851 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
853 // compute intersection mask (s_in != p_in)
854 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
855 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
858 s_in
= _simd_and_ps(s_in
, vActiveMask
);
859 if (!_simd_testz_ps(s_in
, s_in
))
861 for (uint32_t c
= 0; c
< 4; ++c
)
863 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
866 // interpolate attributes and store
867 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
869 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
870 for (uint32_t c
= 0; c
< 4; ++c
)
872 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
873 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
877 // increment outIndex
878 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
881 // compute and store intersection
882 if (!_simd_testz_ps(intersectMask
, intersectMask
))
884 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
886 // increment outIndex for active lanes
887 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
891 p_in
= _simd_and_ps(p_in
, vActiveMask
);
892 if (!_simd_testz_ps(p_in
, p_in
))
894 for (uint32_t c
= 0; c
< 4; ++c
)
896 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
899 // interpolate attributes and store
900 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
902 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ a
;
903 for (uint32_t c
= 0; c
< 4; ++c
)
905 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
906 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
910 // increment outIndex
911 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), p_in
);
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Vertical clipper. Clips SIMD primitives at a time
920 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
921 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
922 /// @param numAttribs - number of valid input attribs, including position
923 simdscalari
ClipPrims(float* pVertices
, const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, int numAttribs
)
926 float* pTempVerts
= (float*)&tlsTempVertices
[0];
928 // zero out num input verts for non-active lanes
929 simdscalari vNumInPts
= _simd_set1_epi32(NumVertsPerPrim
);
930 vNumInPts
= _simd_blendv_epi32(_simd_setzero_si(), vNumInPts
, vClipMask
);
932 // clip prims to frustum
933 simdscalari vNumOutPts
;
934 if (NumVertsPerPrim
== 3)
936 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
937 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
938 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
939 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
940 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
941 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
945 SWR_ASSERT(NumVertsPerPrim
== 2);
946 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
947 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
948 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
949 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
950 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
951 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
954 // restore num verts for non-clipped, active lanes
955 simdscalar vNonClippedMask
= _simd_andnot_ps(vClipMask
, vPrimMask
);
956 vNumOutPts
= _simd_blendv_epi32(vNumOutPts
, _simd_set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
961 const uint32_t workerId
{ 0 };
962 DRAW_CONTEXT
* pDC
{ nullptr };
963 const API_STATE
& state
;
964 simdscalar clipCodes
[NumVertsPerPrim
];
968 // pipeline stage functions
969 void ClipTriangles(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
);
970 void ClipLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
);
971 void ClipPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[], uint32_t primMask
, simdscalari primId
, simdscalari viewportIdx
);