swr/rast: Faster frustum prim culling
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
39 #endif
40
// Bit position of the lowest frustum clip-code flag.  The flags are shifted out of the
// float mantissa so that a register holding clip codes is never a denormalized value
// when it is used as an operand of a float compare.
#define CLIPCODE_SHIFT 23

/// Per-vertex clip-code flags.
enum SWR_CLIPCODES
{
    FRUSTUM_LEFT     = (1 << (CLIPCODE_SHIFT + 0)),
    FRUSTUM_TOP      = (1 << (CLIPCODE_SHIFT + 1)),
    FRUSTUM_RIGHT    = (1 << (CLIPCODE_SHIFT + 2)),
    FRUSTUM_BOTTOM   = (1 << (CLIPCODE_SHIFT + 3)),

    FRUSTUM_NEAR     = (1 << (CLIPCODE_SHIFT + 4)),
    FRUSTUM_FAR      = (1 << (CLIPCODE_SHIFT + 5)),

    NEGW             = (1 << (CLIPCODE_SHIFT + 6)),

    // The guardband codes share a single high bit and are told apart by four low bits.
    // That is sufficient because guardband codes are only ever combined as a union,
    // never as an intersection.
    GUARDBAND_LEFT   = ((1 << (CLIPCODE_SHIFT + 7)) | 0x1),
    GUARDBAND_TOP    = ((1 << (CLIPCODE_SHIFT + 7)) | 0x2),
    GUARDBAND_RIGHT  = ((1 << (CLIPCODE_SHIFT + 7)) | 0x4),
    GUARDBAND_BOTTOM = ((1 << (CLIPCODE_SHIFT + 7)) | 0x8)
};

// Codes that force a primitive through the clipper (near/far, negative w, guardband).
#define GUARDBAND_CLIP_MASK (NEGW | FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_RIGHT | GUARDBAND_TOP | GUARDBAND_BOTTOM)
// Codes of the six view-frustum planes.
#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR)
64
65 template<typename SIMD_T>
66 void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float<SIMD_T> &clipCodes, Integer<SIMD_T> const &viewportIndexes)
67 {
68 clipCodes = SIMD_T::setzero_ps();
69
70 // -w
71 Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
72
73 // FRUSTUM_LEFT
74 Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
75 clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
76
77 // FRUSTUM_TOP
78 vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
79 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
80
81 // FRUSTUM_RIGHT
82 vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
83 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
84
85 // FRUSTUM_BOTTOM
86 vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
87 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
88
89 if (state.rastState.depthClipEnable)
90 {
91 // FRUSTUM_NEAR
92 // DX clips depth [0..w], GL clips [-w..w]
93 if (state.rastState.clipHalfZ)
94 {
95 vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
96 }
97 else
98 {
99 vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
100 }
101 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
102
103 // FRUSTUM_FAR
104 vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
105 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
106 }
107
108 // NEGW
109 vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
110 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
111
112 // GUARDBAND_LEFT
113 Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.left[0], viewportIndexes));
114 vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
115 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
116
117 // GUARDBAND_TOP
118 gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.top[0], viewportIndexes));
119 vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
120 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
121
122 // GUARDBAND_RIGHT
123 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.right[0], viewportIndexes));
124 vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
125 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
126
127 // GUARDBAND_BOTTOM
128 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.bottom[0], viewportIndexes));
129 vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
130 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
131 }
132
133 template<typename SIMD_T>
134 struct BinnerChooser
135 {
136 };
137
138 template<>
139 struct BinnerChooser<SIMD256>
140 {
141 PFN_PROCESS_PRIMS pfnBinFunc;
142
143 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
144 :pfnBinFunc(nullptr)
145 {
146 if (numVertsPerPrim == 3)
147 {
148 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
149
150 }
151 else if (numVertsPerPrim == 2)
152 {
153 pfnBinFunc = BinLines;
154 }
155 else
156 {
157 SWR_ASSERT(0 && "Unexpected points in clipper.");
158 }
159 }
160
161 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
162 :pfnBinFunc(nullptr)
163 {
164 switch (topology)
165 {
166 case TOP_POINT_LIST:
167 pfnBinFunc = BinPoints;
168 break;
169 case TOP_LINE_LIST:
170 case TOP_LINE_STRIP:
171 case TOP_LINE_LOOP:
172 case TOP_LINE_LIST_ADJ:
173 case TOP_LISTSTRIP_ADJ:
174 pfnBinFunc = BinLines;
175 break;
176 default:
177 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
178 break;
179 };
180 }
181
182 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
183 {
184 SWR_ASSERT(pfnBinFunc != nullptr);
185
186 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
187 }
188 };
189
190 #if USE_SIMD16_FRONTEND
191 template<>
192 struct BinnerChooser<SIMD512>
193 {
194 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
195
196 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
197 :pfnBinFunc(nullptr)
198 {
199 if (numVertsPerPrim == 3)
200 {
201 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
202
203 }
204 else if (numVertsPerPrim == 2)
205 {
206 pfnBinFunc = BinLines_simd16;
207 }
208 else
209 {
210 SWR_ASSERT(0 && "Unexpected points in clipper.");
211 }
212 }
213
214 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
215 :pfnBinFunc(nullptr)
216 {
217 switch (topology)
218 {
219 case TOP_POINT_LIST:
220 pfnBinFunc = BinPoints_simd16;
221 break;
222 case TOP_LINE_LIST:
223 case TOP_LINE_STRIP:
224 case TOP_LINE_LOOP:
225 case TOP_LINE_LIST_ADJ:
226 case TOP_LISTSTRIP_ADJ:
227 pfnBinFunc = BinLines_simd16;
228 break;
229 default:
230 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
231 break;
232 };
233 }
234
235 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
236 {
237 SWR_ASSERT(pfnBinFunc != nullptr);
238
239 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
240 }
241 };
242
243 #endif
244 template<typename SIMD_T>
245 struct SimdHelper
246 {
247 };
248
249 template<>
250 struct SimdHelper<SIMD256>
251 {
252 static SIMD256::Float insert_lo_ps(SIMD256::Float a)
253 {
254 return a;
255 }
256
257 static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
258 {
259 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
260 }
261 };
262
263 #if USE_SIMD16_FRONTEND
264 template<>
265 struct SimdHelper<SIMD512>
266 {
267 static SIMD512::Float insert_lo_ps(SIMD256::Float a)
268 {
269 return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
270 }
271
272 static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
273 {
274 return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
275 }
276 };
277
278 #endif
279 // Temp storage used by the clipper
280 template<typename SIMD_T>
281 struct ClipHelper
282 {
283 };
284
285 template<>
286 struct ClipHelper<SIMD256>
287 {
288 static SIMDVERTEX_T<SIMD256> *GetTempVertices()
289 {
290 return tlsTempVertices;
291 }
292 };
293
294 #if USE_SIMD16_FRONTEND
295 template<>
296 struct ClipHelper<SIMD512>
297 {
298 static SIMDVERTEX_T<SIMD512> *GetTempVertices()
299 {
300 return tlsTempVertices_simd16;
301 }
302 };
303
304 #endif
305 template<typename SIMD_T, uint32_t NumVertsPerPrim>
306 class Clipper
307 {
308 public:
309 INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
310 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
311 {
312 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
313 }
314
315 void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T> &viewportIndexes)
316 {
317 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
318 {
319 ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
320 }
321 }
322
323 Float<SIMD_T> ComputeClipCodeIntersection()
324 {
325 Float<SIMD_T> result = clipCodes[0];
326
327 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
328 {
329 result = SIMD_T::and_ps(result, clipCodes[i]);
330 }
331
332 return result;
333 }
334
335 Float<SIMD_T> ComputeClipCodeUnion()
336 {
337 Float<SIMD_T> result = clipCodes[0];
338
339 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
340 {
341 result = SIMD_T::or_ps(result, clipCodes[i]);
342 }
343
344 return result;
345 }
346
347 int ComputeClipMask()
348 {
349 Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
350
351 clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
352
353 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
354 }
355
356 // clipper is responsible for culling any prims with NAN coordinates
357 int ComputeNaNMask(Vec4<SIMD_T> prim[])
358 {
359 Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
360
361 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
362 {
363 Float<SIMD_T> vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
364 vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
365
366 Float<SIMD_T> vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
367 vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
368 }
369
370 return SIMD_T::movemask_ps(vNanMask);
371 }
372
373 int ComputeUserClipCullMask(PA_STATE &pa, Vec4<SIMD_T> prim[])
374 {
375 uint8_t cullMask = state.backendState.cullDistanceMask;
376 uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
377
378 Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
379
380 Vec4<SIMD_T> vClipCullDistLo[3];
381 Vec4<SIMD_T> vClipCullDistHi[3];
382
383 pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
384 pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
385
386 DWORD index;
387 while (_BitScanForward(&index, cullMask))
388 {
389 cullMask &= ~(1 << index);
390 uint32_t slot = index >> 2;
391 uint32_t component = index & 0x3;
392
393 Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
394 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
395 {
396 Float<SIMD_T> vCullComp;
397 if (slot == 0)
398 {
399 vCullComp = vClipCullDistLo[e][component];
400 }
401 else
402 {
403 vCullComp = vClipCullDistHi[e][component];
404 }
405
406 // cull if cull distance < 0 || NAN
407 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
408 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
409 }
410 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
411 }
412
413 // clipper should also discard any primitive with NAN clip distance
414 uint8_t clipMask = state.backendState.clipDistanceMask;
415 while (_BitScanForward(&index, clipMask))
416 {
417 clipMask &= ~(1 << index);
418 uint32_t slot = index >> 2;
419 uint32_t component = index & 0x3;
420
421 Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
422 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
423 {
424 Float<SIMD_T> vClipComp;
425 if (slot == 0)
426 {
427 vClipComp = vClipCullDistLo[e][component];
428 }
429 else
430 {
431 vClipComp = vClipCullDistHi[e][component];
432 }
433
434 Float<SIMD_T> vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
435 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
436 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
437 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
438 }
439 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
440 }
441
442 return SIMD_T::movemask_ps(vClipCullMask);
443 }
444
445 void ClipSimd(const Vec4<SIMD_T> prim[], const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, PA_STATE &pa,
446 const Integer<SIMD_T> &vPrimId, const Integer<SIMD_T> &vViewportIdx, const Integer<SIMD_T> &vRtIdx)
447 {
448 // input/output vertex store for clipper
449 SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
450
451 uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
452 uint32_t provokingVertex = 0;
453 if (pa.binTopology == TOP_TRIANGLE_FAN)
454 {
455 provokingVertex = state.frontendState.provokingVertex.triFan;
456 }
457 ///@todo: line topology for wireframe?
458
459 // assemble pos
460 Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
461 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
462 {
463 vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
464 }
465
466 // assemble attribs
467 const SWR_BACKEND_STATE& backendState = state.backendState;
468
469 int32_t maxSlot = -1;
470 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
471 {
472 // Compute absolute attrib slot in vertex array
473 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
474 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
475 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
476
477 pa.Assemble(inputSlot, tmpVector);
478
479 // if constant interpolation enabled for this attribute, assign the provoking
480 // vertex values to all edges
481 if (CheckBit(constantInterpMask, slot))
482 {
483 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
484 {
485 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
486 }
487 }
488 else
489 {
490 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
491 {
492 vertices[i].attrib[inputSlot] = tmpVector[i];
493 }
494 }
495 }
496
497 // assemble user clip distances if enabled
498 uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
499 if (state.backendState.clipDistanceMask & 0xf)
500 {
501 pa.Assemble(vertexClipCullSlot, tmpVector);
502 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
503 {
504 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
505 }
506 }
507
508 if (state.backendState.clipDistanceMask & 0xf0)
509 {
510 pa.Assemble(vertexClipCullSlot + 1, tmpVector);
511 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
512 {
513 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
514 }
515 }
516
517 uint32_t numAttribs = maxSlot + 1;
518
519 Integer<SIMD_T> vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
520
521 BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
522
523 // set up new PA for binning clipped primitives
524 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
525 if (NumVertsPerPrim == 3)
526 {
527 clipTopology = TOP_TRIANGLE_FAN;
528
529 // so that the binner knows to bloat wide points later
530 if (pa.binTopology == TOP_POINT_LIST)
531 {
532 clipTopology = TOP_POINT_LIST;
533 }
534 }
535 else if (NumVertsPerPrim == 2)
536 {
537 clipTopology = TOP_LINE_LIST;
538 }
539 else
540 {
541 SWR_ASSERT(0 && "Unexpected points in clipper.");
542 }
543
544 const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
545 const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
546 const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
547 const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
548
549 const SIMD256::Integer vOffsets = SIMD256::set_epi32(
550 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
551 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
552 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
553 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
554 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
555 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
556 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
557 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
558
559 // only need to gather 7 verts
560 // @todo dynamic mask based on actual # of verts generated per lane
561 const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
562
563 uint32_t numClippedPrims = 0;
564
565 // tranpose clipper output so that each lane's vertices are in SIMD order
566 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
567 // for triangle fan
568
569 #if defined(_DEBUG)
570 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
571 SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
572
573 #else
574 SIMDVERTEX_T<SIMD_T> transposedPrims[2];
575
576 #endif
577 uint32_t numInputPrims = pa.NumPrims();
578 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
579 {
580 uint32_t numEmittedVerts = pVertexCount[inputPrim];
581 if (numEmittedVerts < NumVertsPerPrim)
582 {
583 continue;
584 }
585 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
586
587 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
588 SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
589
590 numClippedPrims += numEmittedPrims;
591
592 // tranpose clipper output so that each lane's vertices are in SIMD order
593 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
594 // for triangle fan
595
596 // transpose pos
597 uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
598
599 #if 0
600 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
601 static const float *dummy = reinterpret_cast<const float *>(pBase);
602
603 #endif
604 for (uint32_t c = 0; c < 4; ++c)
605 {
606 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
607 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
608 pBase += sizeof(Float<SIMD_T>);
609 }
610
611 // transpose attribs
612 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
613
614 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
615 {
616 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
617
618 for (uint32_t c = 0; c < 4; ++c)
619 {
620 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
621 transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
622 pBase += sizeof(Float<SIMD_T>);
623 }
624 }
625
626 // transpose user clip distances if enabled
627 uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
628 if (state.backendState.clipDistanceMask & 0x0f)
629 {
630 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
631
632 for (uint32_t c = 0; c < 4; ++c)
633 {
634 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
635 transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
636 pBase += sizeof(Float<SIMD_T>);
637 }
638 }
639
640 if (state.backendState.clipDistanceMask & 0xf0)
641 {
642 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
643
644 for (uint32_t c = 0; c < 4; ++c)
645 {
646 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
647 transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
648 pBase += sizeof(Float<SIMD_T>);
649 }
650 }
651
652 PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
653 clipPA.viewportArrayActive = pa.viewportArrayActive;
654 clipPA.rtArrayActive = pa.rtArrayActive;
655
656 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
657
658 const uint32_t primMask = primMaskMap[numEmittedPrims];
659
660 const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
661 const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
662 const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
663
664
665 while (clipPA.GetNextStreamOutput())
666 {
667 do
668 {
669 Vec4<SIMD_T> attrib[NumVertsPerPrim];
670
671 bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
672
673 if (assemble)
674 {
675 binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
676 }
677
678 } while (clipPA.NextPrim());
679 }
680 }
681
682 #if defined(_DEBUG)
683 AlignedFree(transposedPrims);
684
685 #endif
686 // update global pipeline stat
687 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
688 }
689
690 void ExecuteStage(PA_STATE &pa, Vec4<SIMD_T> prim[], uint32_t primMask,
691 Integer<SIMD_T> const &primId, Integer<SIMD_T> const &viewportIdx, Integer<SIMD_T> const &rtIdx)
692 {
693 SWR_ASSERT(pa.pDC != nullptr);
694
695 BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
696
697 // update clipper invocations pipeline stat
698 uint32_t numInvoc = _mm_popcnt_u32(primMask);
699 UPDATE_STAT_FE(CInvocations, numInvoc);
700
701 ComputeClipCodes(prim, viewportIdx);
702
703 // cull prims with NAN coords
704 primMask &= ~ComputeNaNMask(prim);
705
706 // user cull distance cull
707 if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
708 {
709 primMask &= ~ComputeUserClipCullMask(pa, prim);
710 }
711
712 Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
713 // Mask out non-frustum codes
714 clipIntersection = SIMD_T::and_ps(clipIntersection, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
715
716 // cull prims outside view frustum
717 int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
718
719 // skip clipping for points
720 uint32_t clipMask = 0;
721 if (NumVertsPerPrim != 1)
722 {
723 clipMask = validMask & ComputeClipMask();
724 }
725
726 AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
727
728 if (clipMask)
729 {
730 RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
731 // we have to clip tris, execute the clipper, which will also
732 // call the binner
733 ClipSimd(prim, SIMD_T::vmask_ps(validMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
734 RDTSC_END(FEGuardbandClip, 1);
735 }
736 else if (validMask)
737 {
738 // update CPrimitives pipeline state
739 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
740
741 // forward valid prims directly to binner
742 binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
743 }
744 }
745
746 private:
747 Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const &boundaryCoord0, Float<SIMD_T> const &boundaryCoord1)
748 {
749 return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
750 }
751
752 Integer<SIMD_T> ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const &vIndices, uint32_t component)
753 {
754 const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
755 const uint32_t componentStride = sizeof(Float<SIMD_T>);
756 const uint32_t attribStride = sizeof(Vec4<SIMD_T>);
757
758 static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
759 {
760 0 * sizeof(float),
761 1 * sizeof(float),
762 2 * sizeof(float),
763 3 * sizeof(float),
764 4 * sizeof(float),
765 5 * sizeof(float),
766 6 * sizeof(float),
767 7 * sizeof(float),
768 8 * sizeof(float),
769 9 * sizeof(float),
770 10 * sizeof(float),
771 11 * sizeof(float),
772 12 * sizeof(float),
773 13 * sizeof(float),
774 14 * sizeof(float),
775 15 * sizeof(float),
776 };
777
778 static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
779
780 Integer<SIMD_T> vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T> *>(elemOffset));
781
782 // step to the simdvertex
783 Integer<SIMD_T> vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
784
785 // step to the attribute and component
786 vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
787
788 // step to the lane
789 vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
790
791 return vOffsets;
792 }
793
794 Float<SIMD_T> GatherComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component)
795 {
796 Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
797 Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
798
799 return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(vSrc, pBuffer, vOffsets, vMask);
800 }
801
802 void ScatterComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component, Float<SIMD_T> const &vSrc)
803 {
804 Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
805
806 const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
807 const float *pSrc = reinterpret_cast<const float *>(&vSrc);
808 uint32_t mask = SIMD_T::movemask_ps(vMask);
809 DWORD lane;
810 while (_BitScanForward(&lane, mask))
811 {
812 mask &= ~(1 << lane);
813 const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
814 *(float *)pBuf = pSrc[lane];
815 }
816 }
817
818 template<SWR_CLIPCODES ClippingPlane>
819 void intersect(
820 const Float<SIMD_T> &vActiveMask, // active lanes to operate on
821 const Integer<SIMD_T> &s, // index to first edge vertex v0 in pInPts.
822 const Integer<SIMD_T> &p, // index to second edge vertex v1 in pInPts.
823 const Vec4<SIMD_T> &v1, // vertex 0 position
824 const Vec4<SIMD_T> &v2, // vertex 1 position
825 Integer<SIMD_T> &outIndex, // output index.
826 const float *pInVerts, // array of all the input positions.
827 uint32_t numInAttribs, // number of attributes per vertex.
828 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
829 {
830 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
831 uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
832
833 // compute interpolation factor
834 Float<SIMD_T> t;
835 switch (ClippingPlane)
836 {
837 case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
838 case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
839 case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
840 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
841 case FRUSTUM_NEAR:
842 // DX Znear plane is 0, GL is -w
843 if (this->state.rastState.clipHalfZ)
844 {
845 t = ComputeInterpFactor(v1[2], v2[2]);
846 }
847 else
848 {
849 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
850 }
851 break;
852 case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
853 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
854 };
855
856 // interpolate position and store
857 for (uint32_t c = 0; c < 4; ++c)
858 {
859 Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
860 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
861 }
862
863 // interpolate attributes and store
864 for (uint32_t a = 0; a < numInAttribs; ++a)
865 {
866 uint32_t attribSlot = vertexAttribOffset + a;
867 for (uint32_t c = 0; c < 4; ++c)
868 {
869 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
870 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
871 Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
872 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
873 }
874 }
875
876 // interpolate clip distance if enabled
877 if (this->state.backendState.clipDistanceMask & 0xf)
878 {
879 uint32_t attribSlot = vertexClipCullOffset;
880 for (uint32_t c = 0; c < 4; ++c)
881 {
882 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
883 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
884 Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
885 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
886 }
887 }
888
889 if (this->state.backendState.clipDistanceMask & 0xf0)
890 {
891 uint32_t attribSlot = vertexClipCullOffset + 1;
892 for (uint32_t c = 0; c < 4; ++c)
893 {
894 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
895 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
896 Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
897 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
898 }
899 }
900 }
901
902 template<SWR_CLIPCODES ClippingPlane>
903 Float<SIMD_T> inside(const Vec4<SIMD_T> &v)
904 {
905 switch (ClippingPlane)
906 {
907 case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
908 case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]);
909 case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
910 case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]);
911 case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
912 case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]);
913 default:
914 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
915 return SIMD_T::setzero_ps();
916 }
917 }
918
919 template<SWR_CLIPCODES ClippingPlane>
920 Integer<SIMD_T> ClipTriToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
921 {
922 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
923
924 Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
925 Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
926 Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
927
928 while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
929 {
930 Integer<SIMD_T> s = vCurIndex;
931 Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
932 Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
933 p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
934
935 // gather position
936 Vec4<SIMD_T> vInPos0, vInPos1;
937 for (uint32_t c = 0; c < 4; ++c)
938 {
939 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
940 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
941 }
942
943 // compute inside mask
944 Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
945 Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
946
947 // compute intersection mask (s_in != p_in)
948 Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
949 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
950
951 // store s if inside
952 s_in = SIMD_T::and_ps(s_in, vActiveMask);
953 if (!SIMD_T::testz_ps(s_in, s_in))
954 {
955 // store position
956 for (uint32_t c = 0; c < 4; ++c)
957 {
958 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
959 }
960
961 // store attribs
962 for (uint32_t a = 0; a < numInAttribs; ++a)
963 {
964 uint32_t attribSlot = vertexAttribOffset + a;
965 for (uint32_t c = 0; c < 4; ++c)
966 {
967 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
968 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
969 }
970 }
971
972 // store clip distance if enabled
973 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
974 if (this->state.backendState.clipDistanceMask & 0xf)
975 {
976 uint32_t attribSlot = vertexClipCullSlot;
977 for (uint32_t c = 0; c < 4; ++c)
978 {
979 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
980 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
981 }
982 }
983
984 if (this->state.backendState.clipDistanceMask & 0xf0)
985 {
986 uint32_t attribSlot = vertexClipCullSlot + 1;
987 for (uint32_t c = 0; c < 4; ++c)
988 {
989 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
990 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
991 }
992 }
993
994 // increment outIndex
995 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
996 }
997
998 // compute and store intersection
999 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1000 {
1001 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1002
1003 // increment outIndex for active lanes
1004 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1005 }
1006
1007 // increment loop index and update active mask
1008 vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1009 vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1010 }
1011
1012 return vOutIndex;
1013 }
1014
1015 template<SWR_CLIPCODES ClippingPlane>
1016 Integer<SIMD_T> ClipLineToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1017 {
1018 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1019
1020 Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
1021 Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
1022 Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1023
1024 if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1025 {
1026 Integer<SIMD_T> s = vCurIndex;
1027 Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1028
1029 // gather position
1030 Vec4<SIMD_T> vInPos0, vInPos1;
1031 for (uint32_t c = 0; c < 4; ++c)
1032 {
1033 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1034 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1035 }
1036
1037 // compute inside mask
1038 Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1039 Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1040
1041 // compute intersection mask (s_in != p_in)
1042 Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1043 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1044
1045 // store s if inside
1046 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1047 if (!SIMD_T::testz_ps(s_in, s_in))
1048 {
1049 for (uint32_t c = 0; c < 4; ++c)
1050 {
1051 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1052 }
1053
1054 // interpolate attributes and store
1055 for (uint32_t a = 0; a < numInAttribs; ++a)
1056 {
1057 uint32_t attribSlot = vertexAttribOffset + a;
1058 for (uint32_t c = 0; c < 4; ++c)
1059 {
1060 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1061 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1062 }
1063 }
1064
1065 // increment outIndex
1066 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1067 }
1068
1069 // compute and store intersection
1070 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1071 {
1072 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1073
1074 // increment outIndex for active lanes
1075 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1076 }
1077
1078 // store p if inside
1079 p_in = SIMD_T::and_ps(p_in, vActiveMask);
1080 if (!SIMD_T::testz_ps(p_in, p_in))
1081 {
1082 for (uint32_t c = 0; c < 4; ++c)
1083 {
1084 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1085 }
1086
1087 // interpolate attributes and store
1088 for (uint32_t a = 0; a < numInAttribs; ++a)
1089 {
1090 uint32_t attribSlot = vertexAttribOffset + a;
1091 for (uint32_t c = 0; c < 4; ++c)
1092 {
1093 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1094 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1095 }
1096 }
1097
1098 // increment outIndex
1099 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1100 }
1101 }
1102
1103 return vOutIndex;
1104 }
1105
1106 Integer<SIMD_T> ClipPrims(float *pVertices, const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, int numAttribs)
1107 {
1108 // temp storage
1109 float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1110
1111 // zero out num input verts for non-active lanes
1112 Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1113 vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1114
1115 // clip prims to frustum
1116 Integer<SIMD_T> vNumOutPts;
1117 if (NumVertsPerPrim == 3)
1118 {
1119 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1120 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1121 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1122 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125 }
1126 else
1127 {
1128 SWR_ASSERT(NumVertsPerPrim == 2);
1129 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1130 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1131 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1132 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1133 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1134 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1135 }
1136
1137 // restore num verts for non-clipped, active lanes
1138 Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1139 vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1140
1141 return vNumOutPts;
1142 }
1143
1144 const uint32_t workerId{ 0 };
1145 DRAW_CONTEXT *pDC{ nullptr };
1146 const API_STATE &state;
1147 Float<SIMD_T> clipCodes[NumVertsPerPrim];
1148 };
1149
1150
1151 // pipeline stage functions
1152 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1153 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1154 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1155 #if USE_SIMD16_FRONTEND
1156 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1157 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1158 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1159 #endif
1160