swr/rast: Remove hardcoded clip/cull slot from clipper
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
39 #endif
40
// Clip codes are shifted out of the float mantissa so that they do not form
// denormalized values when the codes are used in float compares.
#define CLIPCODE_SHIFT 23

enum SWR_CLIPCODES
{
    // View frustum planes: one bit per plane.
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    NEGW             = (0x40 << CLIPCODE_SHIFT),

    // Guardband planes share a single high bit and are told apart by 4 separate
    // LSBs. That is sufficient because guardband codes are combined with a
    // union, rather than an intersection, of clip codes.
    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};

// Every code that, when present in a primitive's clip-code union, sends the
// primitive through the clipper (see Clipper::ComputeClipMask).
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR | FRUSTUM_FAR | \
                             GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | \
                             NEGW)
63
64 template<typename SIMD_T>
65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
66 {
67 clipCodes = SIMD_T::setzero_ps();
68
69 // -w
70 typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
71
72 // FRUSTUM_LEFT
73 typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
74 clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
75
76 // FRUSTUM_TOP
77 vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
78 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
79
80 // FRUSTUM_RIGHT
81 vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
82 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
83
84 // FRUSTUM_BOTTOM
85 vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
86 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
87
88 if (state.rastState.depthClipEnable)
89 {
90 // FRUSTUM_NEAR
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state.rastState.clipHalfZ)
93 {
94 vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
95 }
96 else
97 {
98 vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
99 }
100 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
101
102 // FRUSTUM_FAR
103 vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
104 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
105 }
106
107 // NEGW
108 vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
109 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
110
111 // GUARDBAND_LEFT
112 typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
113 vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
114 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
115
116 // GUARDBAND_TOP
117 gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
118 vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
119 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
120
121 // GUARDBAND_RIGHT
122 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
123 vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
124 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
125
126 // GUARDBAND_BOTTOM
127 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
128 vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
129 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
130 }
131
132 template<typename SIMD_T>
133 struct BinnerChooser
134 {
135 };
136
137 template<>
138 struct BinnerChooser<SIMD256>
139 {
140 PFN_PROCESS_PRIMS pfnBinFunc;
141
142 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
143 :pfnBinFunc(nullptr)
144 {
145 if (numVertsPerPrim == 3)
146 {
147 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
148
149 }
150 else if (numVertsPerPrim == 2)
151 {
152 pfnBinFunc = BinLines;
153 }
154 else
155 {
156 SWR_ASSERT(0 && "Unexpected points in clipper.");
157 }
158 }
159
160 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
161 :pfnBinFunc(nullptr)
162 {
163 switch (topology)
164 {
165 case TOP_POINT_LIST:
166 pfnBinFunc = BinPoints;
167 break;
168 case TOP_LINE_LIST:
169 case TOP_LINE_STRIP:
170 case TOP_LINE_LOOP:
171 case TOP_LINE_LIST_ADJ:
172 case TOP_LISTSTRIP_ADJ:
173 pfnBinFunc = BinLines;
174 break;
175 default:
176 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
177 break;
178 };
179 }
180
181 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID)
182 {
183 SWR_ASSERT(pfnBinFunc != nullptr);
184
185 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID);
186 }
187 };
188
189 #if USE_SIMD16_FRONTEND
190 template<>
191 struct BinnerChooser<SIMD512>
192 {
193 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
194
195 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
196 :pfnBinFunc(nullptr)
197 {
198 if (numVertsPerPrim == 3)
199 {
200 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
201
202 }
203 else if (numVertsPerPrim == 2)
204 {
205 pfnBinFunc = BinLines_simd16;
206 }
207 else
208 {
209 SWR_ASSERT(0 && "Unexpected points in clipper.");
210 }
211 }
212
213 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
214 :pfnBinFunc(nullptr)
215 {
216 switch (topology)
217 {
218 case TOP_POINT_LIST:
219 pfnBinFunc = BinPoints_simd16;
220 break;
221 case TOP_LINE_LIST:
222 case TOP_LINE_STRIP:
223 case TOP_LINE_LOOP:
224 case TOP_LINE_LIST_ADJ:
225 case TOP_LISTSTRIP_ADJ:
226 pfnBinFunc = BinLines_simd16;
227 break;
228 default:
229 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
230 break;
231 };
232 }
233
234 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID)
235 {
236 SWR_ASSERT(pfnBinFunc != nullptr);
237
238 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID);
239 }
240 };
241
242 #endif
243 template<typename SIMD_T>
244 struct SimdHelper
245 {
246 };
247
248 template<>
249 struct SimdHelper<SIMD256>
250 {
251 static SIMD256::Float insert_lo_ps(SIMD256::Float a)
252 {
253 return a;
254 }
255
256 static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
257 {
258 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
259 }
260 };
261
262 #if USE_SIMD16_FRONTEND
263 template<>
264 struct SimdHelper<SIMD512>
265 {
266 static SIMD512::Float insert_lo_ps(SIMD256::Float a)
267 {
268 return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
269 }
270
271 static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
272 {
273 return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
274 }
275 };
276
277 #endif
278 // Temp storage used by the clipper
279 template<typename SIMD_T>
280 struct ClipHelper
281 {
282 };
283
284 template<>
285 struct ClipHelper<SIMD256>
286 {
287 static SIMDVERTEX_T<SIMD256> *GetTempVertices()
288 {
289 return tlsTempVertices;
290 }
291 };
292
293 #if USE_SIMD16_FRONTEND
294 template<>
295 struct ClipHelper<SIMD512>
296 {
297 static SIMDVERTEX_T<SIMD512> *GetTempVertices()
298 {
299 return tlsTempVertices_simd16;
300 }
301 };
302
303 #endif
304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
305 class Clipper
306 {
307 public:
308 INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
309 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
310 {
311 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
312 }
313
314 void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
315 {
316 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
317 {
318 ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
319 }
320 }
321
322 typename SIMD_T::Float ComputeClipCodeIntersection()
323 {
324 typename SIMD_T::Float result = clipCodes[0];
325
326 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
327 {
328 result = SIMD_T::and_ps(result, clipCodes[i]);
329 }
330
331 return result;
332 }
333
334 typename SIMD_T::Float ComputeClipCodeUnion()
335 {
336 typename SIMD_T::Float result = clipCodes[0];
337
338 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
339 {
340 result = SIMD_T::or_ps(result, clipCodes[i]);
341 }
342
343 return result;
344 }
345
346 int ComputeClipMask()
347 {
348 typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
349
350 clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
351
352 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
353 }
354
355 // clipper is responsible for culling any prims with NAN coordinates
356 int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
357 {
358 typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
359
360 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
361 {
362 typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
363 vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
364
365 typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
366 vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
367 }
368
369 return SIMD_T::movemask_ps(vNanMask);
370 }
371
372 int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
373 {
374 uint8_t cullMask = state.backendState.cullDistanceMask;
375 uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
376
377 typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
378
379 typename SIMD_T::Vec4 vClipCullDistLo[3];
380 typename SIMD_T::Vec4 vClipCullDistHi[3];
381
382 pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
383 pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
384
385 DWORD index;
386 while (_BitScanForward(&index, cullMask))
387 {
388 cullMask &= ~(1 << index);
389 uint32_t slot = index >> 2;
390 uint32_t component = index & 0x3;
391
392 typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
393 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
394 {
395 typename SIMD_T::Float vCullComp;
396 if (slot == 0)
397 {
398 vCullComp = vClipCullDistLo[e][component];
399 }
400 else
401 {
402 vCullComp = vClipCullDistHi[e][component];
403 }
404
405 // cull if cull distance < 0 || NAN
406 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
407 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
408 }
409 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
410 }
411
412 // clipper should also discard any primitive with NAN clip distance
413 uint8_t clipMask = state.backendState.clipDistanceMask;
414 while (_BitScanForward(&index, clipMask))
415 {
416 clipMask &= ~(1 << index);
417 uint32_t slot = index >> 2;
418 uint32_t component = index & 0x3;
419
420 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
421 {
422 typename SIMD_T::Float vClipComp;
423 if (slot == 0)
424 {
425 vClipComp = vClipCullDistLo[e][component];
426 }
427 else
428 {
429 vClipComp = vClipCullDistHi[e][component];
430 }
431
432 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
433 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
434 }
435 }
436
437 return SIMD_T::movemask_ps(vClipCullMask);
438 }
439
440 void ClipSimd(const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, const typename SIMD_T::Integer &vPrimId)
441 {
442 // input/output vertex store for clipper
443 SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
444
445 uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
446 uint32_t provokingVertex = 0;
447 if (pa.binTopology == TOP_TRIANGLE_FAN)
448 {
449 provokingVertex = state.frontendState.provokingVertex.triFan;
450 }
451 ///@todo: line topology for wireframe?
452
453 // assemble pos
454 typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
455 pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
456 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
457 {
458 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
459 }
460
461 // assemble attribs
462 const SWR_BACKEND_STATE& backendState = state.backendState;
463
464 int32_t maxSlot = -1;
465 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
466 {
467 // Compute absolute attrib slot in vertex array
468 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
469 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
470 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
471
472 pa.Assemble(inputSlot, tmpVector);
473
474 // if constant interpolation enabled for this attribute, assign the provoking
475 // vertex values to all edges
476 if (CheckBit(constantInterpMask, slot))
477 {
478 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
479 {
480 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
481 }
482 }
483 else
484 {
485 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
486 {
487 vertices[i].attrib[inputSlot] = tmpVector[i];
488 }
489 }
490 }
491
492 // assemble user clip distances if enabled
493 uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
494 if (state.backendState.clipDistanceMask & 0xf)
495 {
496 pa.Assemble(vertexClipCullSlot, tmpVector);
497 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
498 {
499 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
500 }
501 }
502
503 if (state.backendState.clipDistanceMask & 0xf0)
504 {
505 pa.Assemble(vertexClipCullSlot + 1, tmpVector);
506 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
507 {
508 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
509 }
510 }
511
512 uint32_t numAttribs = maxSlot + 1;
513
514 typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
515
516 BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
517
518 // set up new PA for binning clipped primitives
519 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
520 if (NumVertsPerPrim == 3)
521 {
522 clipTopology = TOP_TRIANGLE_FAN;
523
524 // so that the binner knows to bloat wide points later
525 if (pa.binTopology == TOP_POINT_LIST)
526 {
527 clipTopology = TOP_POINT_LIST;
528 }
529 }
530 else if (NumVertsPerPrim == 2)
531 {
532 clipTopology = TOP_LINE_LIST;
533 }
534 else
535 {
536 SWR_ASSERT(0 && "Unexpected points in clipper.");
537 }
538
539 const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
540 const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
541
542 const SIMD256::Integer vOffsets = SIMD256::set_epi32(
543 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
544 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
545 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
546 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
547 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
548 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
549 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
550 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
551
552 // only need to gather 7 verts
553 // @todo dynamic mask based on actual # of verts generated per lane
554 const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
555
556 uint32_t numClippedPrims = 0;
557
558 // tranpose clipper output so that each lane's vertices are in SIMD order
559 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
560 // for triangle fan
561
562 #if defined(_DEBUG)
563 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
564 SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(malloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2));
565
566 #else
567 SIMDVERTEX_T<SIMD_T> transposedPrims[2];
568
569 #endif
570 for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
571 {
572 uint32_t numEmittedVerts = pVertexCount[inputPrim];
573 if (numEmittedVerts < NumVertsPerPrim)
574 {
575 continue;
576 }
577 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
578
579 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
580 SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
581
582 numClippedPrims += numEmittedPrims;
583
584 // tranpose clipper output so that each lane's vertices are in SIMD order
585 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
586 // for triangle fan
587
588 // transpose pos
589 uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
590
591 #if 0
592 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
593 static const float *dummy = reinterpret_cast<const float *>(pBase);
594
595 #endif
596 for (uint32_t c = 0; c < 4; ++c)
597 {
598 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
599 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
600 pBase += sizeof(typename SIMD_T::Float);
601 }
602
603 // transpose attribs
604 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
605
606 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
607 {
608 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
609
610 for (uint32_t c = 0; c < 4; ++c)
611 {
612 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
613 transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
614 pBase += sizeof(typename SIMD_T::Float);
615 }
616 }
617
618 // transpose user clip distances if enabled
619 uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
620 if (state.backendState.clipDistanceMask & 0x0f)
621 {
622 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
623
624 for (uint32_t c = 0; c < 4; ++c)
625 {
626 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
627 transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
628 pBase += sizeof(typename SIMD_T::Float);
629 }
630 }
631
632 if (state.backendState.clipDistanceMask & 0xf0)
633 {
634 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
635
636 for (uint32_t c = 0; c < 4; ++c)
637 {
638 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
639 transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
640 pBase += sizeof(typename SIMD_T::Float);
641 }
642 }
643
644 PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
645
646 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
647
648 const uint32_t primMask = primMaskMap[numEmittedPrims];
649
650 const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
651
652 while (clipPA.GetNextStreamOutput())
653 {
654 do
655 {
656 typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
657
658 bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
659
660 if (assemble)
661 {
662 binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID);
663 }
664
665 } while (clipPA.NextPrim());
666 }
667 }
668
669 #if defined(_DEBUG)
670 free(transposedPrims);
671
672 #endif
673 // update global pipeline stat
674 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
675 }
676
677 void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask, typename SIMD_T::Integer const &primId)
678 {
679 SWR_ASSERT(pa.pDC != nullptr);
680
681 SWR_CONTEXT *pContext = pa.pDC->pContext;
682
683 BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
684
685 // update clipper invocations pipeline stat
686 uint32_t numInvoc = _mm_popcnt_u32(primMask);
687 UPDATE_STAT_FE(CInvocations, numInvoc);
688
689 // Read back viewport index if required
690 typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
691
692 if (state.backendState.readViewportArrayIndex)
693 {
694 typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim];
695 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
696
697 // OOB indices => forced to zero.
698 typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
699 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
700 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
701 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
702 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
703 }
704
705 ComputeClipCodes(prim, viewportIdx);
706
707 // cull prims with NAN coords
708 primMask &= ~ComputeNaNMask(prim);
709
710 // user cull distance cull
711 if (state.backendState.cullDistanceMask)
712 {
713 primMask &= ~ComputeUserClipCullMask(pa, prim);
714 }
715
716 // cull prims outside view frustum
717 typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
718 int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
719
720 // skip clipping for points
721 uint32_t clipMask = 0;
722 if (NumVertsPerPrim != 1)
723 {
724 clipMask = primMask & ComputeClipMask();
725 }
726
727 if (clipMask)
728 {
729 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
730 // we have to clip tris, execute the clipper, which will also
731 // call the binner
732 ClipSimd(SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId);
733 AR_END(FEGuardbandClip, 1);
734 }
735 else if (validMask)
736 {
737 // update CPrimitives pipeline state
738 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
739
740 // forward valid prims directly to binner
741 binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId);
742 }
743 }
744
745 private:
746 typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
747 {
748 return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
749 }
750
751 typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
752 {
753 const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
754 const uint32_t componentStride = sizeof(typename SIMD_T::Float);
755 const uint32_t attribStride = sizeof(typename SIMD_T::Vec4);
756
757 static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
758 {
759 0 * sizeof(float),
760 1 * sizeof(float),
761 2 * sizeof(float),
762 3 * sizeof(float),
763 4 * sizeof(float),
764 5 * sizeof(float),
765 6 * sizeof(float),
766 7 * sizeof(float),
767 8 * sizeof(float),
768 9 * sizeof(float),
769 10 * sizeof(float),
770 11 * sizeof(float),
771 12 * sizeof(float),
772 13 * sizeof(float),
773 14 * sizeof(float),
774 15 * sizeof(float),
775 };
776
777 static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
778
779 typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
780
781 // step to the simdvertex
782 typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
783
784 // step to the attribute and component
785 vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
786
787 // step to the lane
788 vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
789
790 return vOffsets;
791 }
792
793 typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
794 {
795 typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
796 typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
797
798 return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
799 }
800
801 void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
802 {
803 typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
804
805 const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
806 const float *pSrc = reinterpret_cast<const float *>(&vSrc);
807 uint32_t mask = SIMD_T::movemask_ps(vMask);
808 DWORD lane;
809 while (_BitScanForward(&lane, mask))
810 {
811 mask &= ~(1 << lane);
812 const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
813 *(float *)pBuf = pSrc[lane];
814 }
815 }
816
817 template<SWR_CLIPCODES ClippingPlane>
818 void intersect(
819 const typename SIMD_T::Float &vActiveMask, // active lanes to operate on
820 const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts.
821 const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts.
822 const typename SIMD_T::Vec4 &v1, // vertex 0 position
823 const typename SIMD_T::Vec4 &v2, // vertex 1 position
824 typename SIMD_T::Integer &outIndex, // output index.
825 const float *pInVerts, // array of all the input positions.
826 uint32_t numInAttribs, // number of attributes per vertex.
827 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
828 {
829 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
830 uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
831
832 // compute interpolation factor
833 typename SIMD_T::Float t;
834 switch (ClippingPlane)
835 {
836 case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
837 case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
838 case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
839 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
840 case FRUSTUM_NEAR:
841 // DX Znear plane is 0, GL is -w
842 if (this->state.rastState.clipHalfZ)
843 {
844 t = ComputeInterpFactor(v1[2], v2[2]);
845 }
846 else
847 {
848 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
849 }
850 break;
851 case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
852 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
853 };
854
855 // interpolate position and store
856 for (uint32_t c = 0; c < 4; ++c)
857 {
858 typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
859 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
860 }
861
862 // interpolate attributes and store
863 for (uint32_t a = 0; a < numInAttribs; ++a)
864 {
865 uint32_t attribSlot = vertexAttribOffset + a;
866 for (uint32_t c = 0; c < 4; ++c)
867 {
868 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
869 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
870 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
871 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
872 }
873 }
874
875 // interpolate clip distance if enabled
876 if (this->state.backendState.clipDistanceMask & 0xf)
877 {
878 uint32_t attribSlot = vertexClipCullOffset;
879 for (uint32_t c = 0; c < 4; ++c)
880 {
881 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
882 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
883 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
884 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
885 }
886 }
887
888 if (this->state.backendState.clipDistanceMask & 0xf0)
889 {
890 uint32_t attribSlot = vertexClipCullOffset + 1;
891 for (uint32_t c = 0; c < 4; ++c)
892 {
893 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
894 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
895 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
896 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
897 }
898 }
899 }
900
901 template<SWR_CLIPCODES ClippingPlane>
902 typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
903 {
904 switch (ClippingPlane)
905 {
906 case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
907 case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]);
908 case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
909 case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]);
910 case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
911 case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]);
912 default:
913 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
914 return SIMD_T::setzero_ps();
915 }
916 }
917
918 template<SWR_CLIPCODES ClippingPlane>
919 typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
920 {
921 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
922
923 typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
924 typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
925 typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
926
927 while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
928 {
929 typename SIMD_T::Integer s = vCurIndex;
930 typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
931 typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
932 p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
933
934 // gather position
935 typename SIMD_T::Vec4 vInPos0, vInPos1;
936 for (uint32_t c = 0; c < 4; ++c)
937 {
938 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
939 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
940 }
941
942 // compute inside mask
943 typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
944 typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
945
946 // compute intersection mask (s_in != p_in)
947 typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
948 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
949
950 // store s if inside
951 s_in = SIMD_T::and_ps(s_in, vActiveMask);
952 if (!SIMD_T::testz_ps(s_in, s_in))
953 {
954 // store position
955 for (uint32_t c = 0; c < 4; ++c)
956 {
957 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
958 }
959
960 // store attribs
961 for (uint32_t a = 0; a < numInAttribs; ++a)
962 {
963 uint32_t attribSlot = vertexAttribOffset + a;
964 for (uint32_t c = 0; c < 4; ++c)
965 {
966 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
967 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
968 }
969 }
970
971 // store clip distance if enabled
972 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
973 if (this->state.backendState.clipDistanceMask & 0xf)
974 {
975 uint32_t attribSlot = vertexClipCullSlot;
976 for (uint32_t c = 0; c < 4; ++c)
977 {
978 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
979 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
980 }
981 }
982
983 if (this->state.backendState.clipDistanceMask & 0xf0)
984 {
985 uint32_t attribSlot = vertexClipCullSlot + 1;
986 for (uint32_t c = 0; c < 4; ++c)
987 {
988 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
989 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
990 }
991 }
992
993 // increment outIndex
994 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
995 }
996
997 // compute and store intersection
998 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
999 {
1000 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1001
1002 // increment outIndex for active lanes
1003 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1004 }
1005
1006 // increment loop index and update active mask
1007 vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1008 vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1009 }
1010
1011 return vOutIndex;
1012 }
1013
1014 template<SWR_CLIPCODES ClippingPlane>
1015 typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1016 {
1017 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1018
1019 typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1020 typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1021 typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1022
1023 if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1024 {
1025 typename SIMD_T::Integer s = vCurIndex;
1026 typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1027
1028 // gather position
1029 typename SIMD_T::Vec4 vInPos0, vInPos1;
1030 for (uint32_t c = 0; c < 4; ++c)
1031 {
1032 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1033 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1034 }
1035
1036 // compute inside mask
1037 typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1038 typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1039
1040 // compute intersection mask (s_in != p_in)
1041 typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1042 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1043
1044 // store s if inside
1045 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1046 if (!SIMD_T::testz_ps(s_in, s_in))
1047 {
1048 for (uint32_t c = 0; c < 4; ++c)
1049 {
1050 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1051 }
1052
1053 // interpolate attributes and store
1054 for (uint32_t a = 0; a < numInAttribs; ++a)
1055 {
1056 uint32_t attribSlot = vertexAttribOffset + a;
1057 for (uint32_t c = 0; c < 4; ++c)
1058 {
1059 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1060 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1061 }
1062 }
1063
1064 // increment outIndex
1065 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1066 }
1067
1068 // compute and store intersection
1069 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1070 {
1071 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1072
1073 // increment outIndex for active lanes
1074 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1075 }
1076
1077 // store p if inside
1078 p_in = SIMD_T::and_ps(p_in, vActiveMask);
1079 if (!SIMD_T::testz_ps(p_in, p_in))
1080 {
1081 for (uint32_t c = 0; c < 4; ++c)
1082 {
1083 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1084 }
1085
1086 // interpolate attributes and store
1087 for (uint32_t a = 0; a < numInAttribs; ++a)
1088 {
1089 uint32_t attribSlot = vertexAttribOffset + a;
1090 for (uint32_t c = 0; c < 4; ++c)
1091 {
1092 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1093 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1094 }
1095 }
1096
1097 // increment outIndex
1098 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1099 }
1100 }
1101
1102 return vOutIndex;
1103 }
1104
1105 typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1106 {
1107 // temp storage
1108 float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1109
1110 // zero out num input verts for non-active lanes
1111 typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1112 vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1113
1114 // clip prims to frustum
1115 typename SIMD_T::Integer vNumOutPts;
1116 if (NumVertsPerPrim == 3)
1117 {
1118 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1119 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1120 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1121 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1122 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1123 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1124 }
1125 else
1126 {
1127 SWR_ASSERT(NumVertsPerPrim == 2);
1128 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1129 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1130 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1131 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1132 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1133 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1134 }
1135
1136 // restore num verts for non-clipped, active lanes
1137 typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1138 vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1139
1140 return vNumOutPts;
1141 }
1142
// Data members of the clipper.
// NOTE(review): the constructor that binds these members is outside this
// excerpt -- confirm how workerId, pDC and state are actually set.

// Worker id; default-initialized to 0.
1143 const uint32_t workerId{ 0 };
// Draw context pointer; default-initialized to nullptr.
1144 DRAW_CONTEXT *pDC{ nullptr };
// Read-only API state. The clip routines read backendState
// (vertexAttribOffset, vertexClipCullOffset, clipDistanceMask) and
// rastState.clipHalfZ through this reference.
1145 const API_STATE &state;
// One SIMD float per primitive vertex; presumably the per-vertex clip codes
// computed elsewhere in the class -- TODO confirm against the code that
// writes this array.
1146 typename SIMD_T::Float clipCodes[NumVertsPerPrim];
1147 };
1148
1149
1150 // pipeline stage functions
// Declarations only; the definitions are not part of this header excerpt.
// All three entry points share one signature: draw context, primitive
// assembly state, worker id, the assembled primitive vectors, a primitive
// mask and a SIMD vector of primitive ids.
// NOTE(review): presumably one bit of primMask per SIMD lane -- confirm
// against the definitions.
1151 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
1152 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
1153 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
1154 #if USE_SIMD16_FRONTEND
// simd16 counterparts of the functions above, taking simd16vector /
// simd16scalari arguments; only declared when USE_SIMD16_FRONTEND is enabled
// and declared with the SIMDCALL calling-convention macro.
1155 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
1156 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
1157 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
1158 #endif
1159