swr/rast: Removed unused variable
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
39 #endif
40
//////////////////////////////////////////////////////////////////////////
/// @brief Clip code bits produced per vertex by ComputeClipCodes().
enum SWR_CLIPCODES
{
// The codes are reinterpreted as floats and used in float compares, so every
// code has at least one bit above the 23 mantissa bits; otherwise the value
// would be a denormal.
// The four guardband codes share a single high bit and are told apart by
// their low 4 bits.  That is enough because guardband codes are combined with
// a union of clip codes rather than an intersection.
#define CLIPCODE_SHIFT 23
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    NEGW             = (0x40 << CLIPCODE_SHIFT),

    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};

// Clip codes that send a primitive through the clipper: near/far planes, the
// four guardband edges and negative w.
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | NEGW)
63
64 template<typename SIMD_T>
65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
66 {
67 clipCodes = SIMD_T::setzero_ps();
68
69 // -w
70 typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
71
72 // FRUSTUM_LEFT
73 typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
74 clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
75
76 // FRUSTUM_TOP
77 vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
78 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
79
80 // FRUSTUM_RIGHT
81 vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
82 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
83
84 // FRUSTUM_BOTTOM
85 vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
86 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
87
88 if (state.rastState.depthClipEnable)
89 {
90 // FRUSTUM_NEAR
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state.rastState.clipHalfZ)
93 {
94 vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
95 }
96 else
97 {
98 vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
99 }
100 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
101
102 // FRUSTUM_FAR
103 vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
104 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
105 }
106
107 // NEGW
108 vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
109 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
110
111 // GUARDBAND_LEFT
112 typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
113 vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
114 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
115
116 // GUARDBAND_TOP
117 gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
118 vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
119 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
120
121 // GUARDBAND_RIGHT
122 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
123 vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
124 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
125
126 // GUARDBAND_BOTTOM
127 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
128 vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
129 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
130 }
131
// Primary template is intentionally empty; the usable implementations are the
// per-SIMD-width specializations.
template<typename SIMD_T>
struct BinnerChooser {};
136
137 template<>
138 struct BinnerChooser<SIMD256>
139 {
140 PFN_PROCESS_PRIMS pfnBinFunc;
141
142 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
143 :pfnBinFunc(nullptr)
144 {
145 if (numVertsPerPrim == 3)
146 {
147 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
148
149 }
150 else if (numVertsPerPrim == 2)
151 {
152 pfnBinFunc = BinLines;
153 }
154 else
155 {
156 SWR_ASSERT(0 && "Unexpected points in clipper.");
157 }
158 }
159
160 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
161 :pfnBinFunc(nullptr)
162 {
163 switch (topology)
164 {
165 case TOP_POINT_LIST:
166 pfnBinFunc = BinPoints;
167 break;
168 case TOP_LINE_LIST:
169 case TOP_LINE_STRIP:
170 case TOP_LINE_LOOP:
171 case TOP_LINE_LIST_ADJ:
172 case TOP_LISTSTRIP_ADJ:
173 pfnBinFunc = BinLines;
174 break;
175 default:
176 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
177 break;
178 };
179 }
180
181 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
182 {
183 SWR_ASSERT(pfnBinFunc != nullptr);
184
185 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
186 }
187 };
188
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Picks the SIMD512 (simd16) binning function for a primitive type
///        and forwards primitives to it.
template<>
struct BinnerChooser<SIMD512>
{
    PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;

    /// @brief Select by vertex count: 3 -> triangles, 2 -> lines.  Any other
    ///        count asserts and leaves pfnBinFunc null.
    BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
        : pfnBinFunc(nullptr)
    {
        switch (numVertsPerPrim)
        {
        case 3:
            pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
            break;

        case 2:
            pfnBinFunc = BinLines_simd16;
            break;

        default:
            SWR_ASSERT(0 && "Unexpected points in clipper.");
            break;
        }
    }

    /// @brief Select by topology: point list -> points, line topologies ->
    ///        lines, everything else -> triangles.
    BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
        : pfnBinFunc(nullptr)
    {
        switch (topology)
        {
        case TOP_POINT_LIST:
            pfnBinFunc = BinPoints_simd16;
            break;

        case TOP_LINE_LIST:
        case TOP_LINE_STRIP:
        case TOP_LINE_LOOP:
        case TOP_LINE_LIST_ADJ:
        case TOP_LISTSTRIP_ADJ:
            pfnBinFunc = BinLines_simd16;
            break;

        default:
            pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
            break;
        }
    }

    /// @brief Calls the selected binning function; asserts that one was chosen.
    void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
    {
        SWR_ASSERT(pfnBinFunc != nullptr);

        pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
    }
};

#endif
// Primary template is intentionally empty; SIMD-width specific helpers are
// provided by the specializations.
template<typename SIMD_T>
struct SimdHelper {};
247
248 template<>
249 struct SimdHelper<SIMD256>
250 {
251 static SIMD256::Float insert_lo_ps(SIMD256::Float a)
252 {
253 return a;
254 }
255
256 static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
257 {
258 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
259 }
260 };
261
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD512 flavour of the width-dependent helpers used by the clipper.
template<>
struct SimdHelper<SIMD512>
{
    /// @brief Inserts a SIMD256 value at position 0 of a zeroed SIMD512 value.
    static SIMD512::Float insert_lo_ps(SIMD256::Float a)
    {
        const SIMD512::Float vZero = SIMD512::setzero_ps();

        return SIMD512::insert_ps<0>(vZero, a);
    }

    /// @brief Per-lane a == b (EQ_OQ compare), returned as a bitmask.
    static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
    {
        return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
    }
};

#endif
// Accessor for the clipper's temp vertex storage.  The primary template is
// intentionally empty; each SIMD width has its own specialization.
template<typename SIMD_T>
struct ClipHelper {};
283
284 template<>
285 struct ClipHelper<SIMD256>
286 {
287 static SIMDVERTEX_T<SIMD256> *GetTempVertices()
288 {
289 return tlsTempVertices;
290 }
291 };
292
#if USE_SIMD16_FRONTEND
template<>
struct ClipHelper<SIMD512>
{
    /// @brief Returns a pointer to the first element of tlsTempVertices_simd16.
    static SIMDVERTEX_T<SIMD512> *GetTempVertices()
    {
        return &tlsTempVertices_simd16[0];
    }
};

#endif
304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
305 class Clipper
306 {
307 public:
308 INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
309 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
310 {
311 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
312 }
313
314 void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
315 {
316 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
317 {
318 ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
319 }
320 }
321
322 typename SIMD_T::Float ComputeClipCodeIntersection()
323 {
324 typename SIMD_T::Float result = clipCodes[0];
325
326 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
327 {
328 result = SIMD_T::and_ps(result, clipCodes[i]);
329 }
330
331 return result;
332 }
333
334 typename SIMD_T::Float ComputeClipCodeUnion()
335 {
336 typename SIMD_T::Float result = clipCodes[0];
337
338 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
339 {
340 result = SIMD_T::or_ps(result, clipCodes[i]);
341 }
342
343 return result;
344 }
345
346 int ComputeClipMask()
347 {
348 typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
349
350 clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
351
352 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
353 }
354
355 // clipper is responsible for culling any prims with NAN coordinates
356 int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
357 {
358 typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
359
360 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
361 {
362 typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
363 vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
364
365 typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
366 vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
367 }
368
369 return SIMD_T::movemask_ps(vNanMask);
370 }
371
372 int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
373 {
374 uint8_t cullMask = state.backendState.cullDistanceMask;
375 uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
376
377 typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
378
379 typename SIMD_T::Vec4 vClipCullDistLo[3];
380 typename SIMD_T::Vec4 vClipCullDistHi[3];
381
382 pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
383 pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
384
385 DWORD index;
386 while (_BitScanForward(&index, cullMask))
387 {
388 cullMask &= ~(1 << index);
389 uint32_t slot = index >> 2;
390 uint32_t component = index & 0x3;
391
392 typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
393 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
394 {
395 typename SIMD_T::Float vCullComp;
396 if (slot == 0)
397 {
398 vCullComp = vClipCullDistLo[e][component];
399 }
400 else
401 {
402 vCullComp = vClipCullDistHi[e][component];
403 }
404
405 // cull if cull distance < 0 || NAN
406 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
407 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
408 }
409 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
410 }
411
412 // clipper should also discard any primitive with NAN clip distance
413 uint8_t clipMask = state.backendState.clipDistanceMask;
414 while (_BitScanForward(&index, clipMask))
415 {
416 clipMask &= ~(1 << index);
417 uint32_t slot = index >> 2;
418 uint32_t component = index & 0x3;
419
420 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
421 {
422 typename SIMD_T::Float vClipComp;
423 if (slot == 0)
424 {
425 vClipComp = vClipCullDistLo[e][component];
426 }
427 else
428 {
429 vClipComp = vClipCullDistHi[e][component];
430 }
431
432 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
433 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
434 }
435 }
436
437 return SIMD_T::movemask_ps(vClipCullMask);
438 }
439
440 void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
441 const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
442 {
443 // input/output vertex store for clipper
444 SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
445
446 uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
447 uint32_t provokingVertex = 0;
448 if (pa.binTopology == TOP_TRIANGLE_FAN)
449 {
450 provokingVertex = state.frontendState.provokingVertex.triFan;
451 }
452 ///@todo: line topology for wireframe?
453
454 // assemble pos
455 typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
456 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
457 {
458 vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
459 }
460
461 // assemble attribs
462 const SWR_BACKEND_STATE& backendState = state.backendState;
463
464 int32_t maxSlot = -1;
465 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
466 {
467 // Compute absolute attrib slot in vertex array
468 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
469 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
470 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
471
472 pa.Assemble(inputSlot, tmpVector);
473
474 // if constant interpolation enabled for this attribute, assign the provoking
475 // vertex values to all edges
476 if (CheckBit(constantInterpMask, slot))
477 {
478 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
479 {
480 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
481 }
482 }
483 else
484 {
485 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
486 {
487 vertices[i].attrib[inputSlot] = tmpVector[i];
488 }
489 }
490 }
491
492 // assemble user clip distances if enabled
493 uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
494 if (state.backendState.clipDistanceMask & 0xf)
495 {
496 pa.Assemble(vertexClipCullSlot, tmpVector);
497 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
498 {
499 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
500 }
501 }
502
503 if (state.backendState.clipDistanceMask & 0xf0)
504 {
505 pa.Assemble(vertexClipCullSlot + 1, tmpVector);
506 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
507 {
508 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
509 }
510 }
511
512 uint32_t numAttribs = maxSlot + 1;
513
514 typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
515
516 BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
517
518 // set up new PA for binning clipped primitives
519 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
520 if (NumVertsPerPrim == 3)
521 {
522 clipTopology = TOP_TRIANGLE_FAN;
523
524 // so that the binner knows to bloat wide points later
525 if (pa.binTopology == TOP_POINT_LIST)
526 {
527 clipTopology = TOP_POINT_LIST;
528 }
529 }
530 else if (NumVertsPerPrim == 2)
531 {
532 clipTopology = TOP_LINE_LIST;
533 }
534 else
535 {
536 SWR_ASSERT(0 && "Unexpected points in clipper.");
537 }
538
539 const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
540 const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
541 const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
542 const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
543
544 const SIMD256::Integer vOffsets = SIMD256::set_epi32(
545 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
546 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
547 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
548 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
549 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
550 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
551 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
552 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
553
554 // only need to gather 7 verts
555 // @todo dynamic mask based on actual # of verts generated per lane
556 const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
557
558 uint32_t numClippedPrims = 0;
559
560 // tranpose clipper output so that each lane's vertices are in SIMD order
561 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
562 // for triangle fan
563
564 #if defined(_DEBUG)
565 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
566 SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
567
568 #else
569 SIMDVERTEX_T<SIMD_T> transposedPrims[2];
570
571 #endif
572 uint32_t numInputPrims = pa.NumPrims();
573 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
574 {
575 uint32_t numEmittedVerts = pVertexCount[inputPrim];
576 if (numEmittedVerts < NumVertsPerPrim)
577 {
578 continue;
579 }
580 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
581
582 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
583 SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
584
585 numClippedPrims += numEmittedPrims;
586
587 // tranpose clipper output so that each lane's vertices are in SIMD order
588 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
589 // for triangle fan
590
591 // transpose pos
592 uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
593
594 #if 0
595 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
596 static const float *dummy = reinterpret_cast<const float *>(pBase);
597
598 #endif
599 for (uint32_t c = 0; c < 4; ++c)
600 {
601 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
602 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
603 pBase += sizeof(typename SIMD_T::Float);
604 }
605
606 // transpose attribs
607 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
608
609 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
610 {
611 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
612
613 for (uint32_t c = 0; c < 4; ++c)
614 {
615 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
616 transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
617 pBase += sizeof(typename SIMD_T::Float);
618 }
619 }
620
621 // transpose user clip distances if enabled
622 uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
623 if (state.backendState.clipDistanceMask & 0x0f)
624 {
625 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
626
627 for (uint32_t c = 0; c < 4; ++c)
628 {
629 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
630 transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
631 pBase += sizeof(typename SIMD_T::Float);
632 }
633 }
634
635 if (state.backendState.clipDistanceMask & 0xf0)
636 {
637 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
638
639 for (uint32_t c = 0; c < 4; ++c)
640 {
641 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
642 transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
643 pBase += sizeof(typename SIMD_T::Float);
644 }
645 }
646
647 PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
648 clipPA.viewportArrayActive = pa.viewportArrayActive;
649 clipPA.rtArrayActive = pa.rtArrayActive;
650
651 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
652
653 const uint32_t primMask = primMaskMap[numEmittedPrims];
654
655 const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
656 const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
657 const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
658
659
660 while (clipPA.GetNextStreamOutput())
661 {
662 do
663 {
664 typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
665
666 bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
667
668 if (assemble)
669 {
670 binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
671 }
672
673 } while (clipPA.NextPrim());
674 }
675 }
676
677 #if defined(_DEBUG)
678 AlignedFree(transposedPrims);
679
680 #endif
681 // update global pipeline stat
682 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
683 }
684
685 void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
686 typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
687 {
688 SWR_ASSERT(pa.pDC != nullptr);
689
690 BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
691
692 // update clipper invocations pipeline stat
693 uint32_t numInvoc = _mm_popcnt_u32(primMask);
694 UPDATE_STAT_FE(CInvocations, numInvoc);
695
696 ComputeClipCodes(prim, viewportIdx);
697
698 // cull prims with NAN coords
699 primMask &= ~ComputeNaNMask(prim);
700
701 // user cull distance cull
702 if (state.backendState.cullDistanceMask)
703 {
704 primMask &= ~ComputeUserClipCullMask(pa, prim);
705 }
706
707 // cull prims outside view frustum
708 typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
709 int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
710
711 // skip clipping for points
712 uint32_t clipMask = 0;
713 if (NumVertsPerPrim != 1)
714 {
715 clipMask = primMask & ComputeClipMask();
716 }
717
718 if (clipMask)
719 {
720 RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
721 // we have to clip tris, execute the clipper, which will also
722 // call the binner
723 ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
724 RDTSC_END(FEGuardbandClip, 1);
725 }
726 else if (validMask)
727 {
728 // update CPrimitives pipeline state
729 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
730
731 // forward valid prims directly to binner
732 binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
733 }
734 }
735
736 private:
737 typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
738 {
739 return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
740 }
741
742 typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
743 {
744 const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
745 const uint32_t componentStride = sizeof(typename SIMD_T::Float);
746 const uint32_t attribStride = sizeof(typename SIMD_T::Vec4);
747
748 static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
749 {
750 0 * sizeof(float),
751 1 * sizeof(float),
752 2 * sizeof(float),
753 3 * sizeof(float),
754 4 * sizeof(float),
755 5 * sizeof(float),
756 6 * sizeof(float),
757 7 * sizeof(float),
758 8 * sizeof(float),
759 9 * sizeof(float),
760 10 * sizeof(float),
761 11 * sizeof(float),
762 12 * sizeof(float),
763 13 * sizeof(float),
764 14 * sizeof(float),
765 15 * sizeof(float),
766 };
767
768 static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
769
770 typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
771
772 // step to the simdvertex
773 typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
774
775 // step to the attribute and component
776 vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
777
778 // step to the lane
779 vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
780
781 return vOffsets;
782 }
783
784 typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
785 {
786 typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
787 typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
788
789 return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
790 }
791
792 void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
793 {
794 typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
795
796 const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
797 const float *pSrc = reinterpret_cast<const float *>(&vSrc);
798 uint32_t mask = SIMD_T::movemask_ps(vMask);
799 DWORD lane;
800 while (_BitScanForward(&lane, mask))
801 {
802 mask &= ~(1 << lane);
803 const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
804 *(float *)pBuf = pSrc[lane];
805 }
806 }
807
808 template<SWR_CLIPCODES ClippingPlane>
809 void intersect(
810 const typename SIMD_T::Float &vActiveMask, // active lanes to operate on
811 const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts.
812 const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts.
813 const typename SIMD_T::Vec4 &v1, // vertex 0 position
814 const typename SIMD_T::Vec4 &v2, // vertex 1 position
815 typename SIMD_T::Integer &outIndex, // output index.
816 const float *pInVerts, // array of all the input positions.
817 uint32_t numInAttribs, // number of attributes per vertex.
818 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
819 {
820 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
821 uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
822
823 // compute interpolation factor
824 typename SIMD_T::Float t;
825 switch (ClippingPlane)
826 {
827 case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
828 case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
829 case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
830 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
831 case FRUSTUM_NEAR:
832 // DX Znear plane is 0, GL is -w
833 if (this->state.rastState.clipHalfZ)
834 {
835 t = ComputeInterpFactor(v1[2], v2[2]);
836 }
837 else
838 {
839 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
840 }
841 break;
842 case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
843 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
844 };
845
846 // interpolate position and store
847 for (uint32_t c = 0; c < 4; ++c)
848 {
849 typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
850 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
851 }
852
853 // interpolate attributes and store
854 for (uint32_t a = 0; a < numInAttribs; ++a)
855 {
856 uint32_t attribSlot = vertexAttribOffset + a;
857 for (uint32_t c = 0; c < 4; ++c)
858 {
859 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
860 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
861 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
862 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
863 }
864 }
865
866 // interpolate clip distance if enabled
867 if (this->state.backendState.clipDistanceMask & 0xf)
868 {
869 uint32_t attribSlot = vertexClipCullOffset;
870 for (uint32_t c = 0; c < 4; ++c)
871 {
872 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
873 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
874 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
875 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
876 }
877 }
878
879 if (this->state.backendState.clipDistanceMask & 0xf0)
880 {
881 uint32_t attribSlot = vertexClipCullOffset + 1;
882 for (uint32_t c = 0; c < 4; ++c)
883 {
884 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
885 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
886 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
887 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
888 }
889 }
890 }
891
892 template<SWR_CLIPCODES ClippingPlane>
893 typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
894 {
895 switch (ClippingPlane)
896 {
897 case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
898 case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]);
899 case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
900 case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]);
901 case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
902 case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]);
903 default:
904 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
905 return SIMD_T::setzero_ps();
906 }
907 }
908
909 template<SWR_CLIPCODES ClippingPlane>
910 typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
911 {
912 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
913
914 typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
915 typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
916 typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
917
918 while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
919 {
920 typename SIMD_T::Integer s = vCurIndex;
921 typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
922 typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
923 p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
924
925 // gather position
926 typename SIMD_T::Vec4 vInPos0, vInPos1;
927 for (uint32_t c = 0; c < 4; ++c)
928 {
929 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
930 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
931 }
932
933 // compute inside mask
934 typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
935 typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
936
937 // compute intersection mask (s_in != p_in)
938 typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
939 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
940
941 // store s if inside
942 s_in = SIMD_T::and_ps(s_in, vActiveMask);
943 if (!SIMD_T::testz_ps(s_in, s_in))
944 {
945 // store position
946 for (uint32_t c = 0; c < 4; ++c)
947 {
948 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
949 }
950
951 // store attribs
952 for (uint32_t a = 0; a < numInAttribs; ++a)
953 {
954 uint32_t attribSlot = vertexAttribOffset + a;
955 for (uint32_t c = 0; c < 4; ++c)
956 {
957 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
958 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
959 }
960 }
961
962 // store clip distance if enabled
963 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
964 if (this->state.backendState.clipDistanceMask & 0xf)
965 {
966 uint32_t attribSlot = vertexClipCullSlot;
967 for (uint32_t c = 0; c < 4; ++c)
968 {
969 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
970 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
971 }
972 }
973
974 if (this->state.backendState.clipDistanceMask & 0xf0)
975 {
976 uint32_t attribSlot = vertexClipCullSlot + 1;
977 for (uint32_t c = 0; c < 4; ++c)
978 {
979 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
980 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
981 }
982 }
983
984 // increment outIndex
985 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
986 }
987
988 // compute and store intersection
989 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
990 {
991 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
992
993 // increment outIndex for active lanes
994 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
995 }
996
997 // increment loop index and update active mask
998 vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
999 vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1000 }
1001
1002 return vOutIndex;
1003 }
1004
1005 template<SWR_CLIPCODES ClippingPlane>
1006 typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1007 {
1008 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1009
1010 typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1011 typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1012 typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1013
1014 if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1015 {
1016 typename SIMD_T::Integer s = vCurIndex;
1017 typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1018
1019 // gather position
1020 typename SIMD_T::Vec4 vInPos0, vInPos1;
1021 for (uint32_t c = 0; c < 4; ++c)
1022 {
1023 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1024 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1025 }
1026
1027 // compute inside mask
1028 typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1029 typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1030
1031 // compute intersection mask (s_in != p_in)
1032 typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1033 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1034
1035 // store s if inside
1036 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1037 if (!SIMD_T::testz_ps(s_in, s_in))
1038 {
1039 for (uint32_t c = 0; c < 4; ++c)
1040 {
1041 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1042 }
1043
1044 // interpolate attributes and store
1045 for (uint32_t a = 0; a < numInAttribs; ++a)
1046 {
1047 uint32_t attribSlot = vertexAttribOffset + a;
1048 for (uint32_t c = 0; c < 4; ++c)
1049 {
1050 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1051 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1052 }
1053 }
1054
1055 // increment outIndex
1056 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1057 }
1058
1059 // compute and store intersection
1060 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1061 {
1062 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1063
1064 // increment outIndex for active lanes
1065 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1066 }
1067
1068 // store p if inside
1069 p_in = SIMD_T::and_ps(p_in, vActiveMask);
1070 if (!SIMD_T::testz_ps(p_in, p_in))
1071 {
1072 for (uint32_t c = 0; c < 4; ++c)
1073 {
1074 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1075 }
1076
1077 // interpolate attributes and store
1078 for (uint32_t a = 0; a < numInAttribs; ++a)
1079 {
1080 uint32_t attribSlot = vertexAttribOffset + a;
1081 for (uint32_t c = 0; c < 4; ++c)
1082 {
1083 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1084 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1085 }
1086 }
1087
1088 // increment outIndex
1089 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1090 }
1091 }
1092
1093 return vOutIndex;
1094 }
1095
1096 typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1097 {
1098 // temp storage
1099 float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1100
1101 // zero out num input verts for non-active lanes
1102 typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1103 vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1104
1105 // clip prims to frustum
1106 typename SIMD_T::Integer vNumOutPts;
1107 if (NumVertsPerPrim == 3)
1108 {
1109 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1110 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1111 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1112 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1113 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1114 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1115 }
1116 else
1117 {
1118 SWR_ASSERT(NumVertsPerPrim == 2);
1119 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1120 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1121 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1122 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125 }
1126
1127 // restore num verts for non-clipped, active lanes
1128 typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1129 vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1130
1131 return vNumOutPts;
1132 }
1133
1134 const uint32_t workerId{ 0 }; // worker index, 0 by default; not read by the clip routines shown above
1135 DRAW_CONTEXT *pDC{ nullptr }; // draw context pointer, null by default - NOTE(review): ownership/lifetime not visible here
1136 const API_STATE &state; // API state; the clip routines read backendState (attribute and clip-distance slot layout) and rastState.clipHalfZ from it
1137 typename SIMD_T::Float clipCodes[NumVertsPerPrim]; // one SIMD float per primitive vertex - presumably the per-vertex clip codes; confirm against the code that fills it
1138 };
1139
1140
1141 // pipeline stage functions
// Clip entry points, one per primitive topology (triangles, lines, points). Only the
// declarations appear here. All three share one parameter list: the draw context, the
// primitive-assembly state, the worker id, the primitive vertex vectors, a primitive mask,
// and per-primitive id / viewport index / rtIdx vectors (rtIdx is presumably the
// render-target index - confirm against the definitions).
1142 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1143 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1144 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1145 #if USE_SIMD16_FRONTEND
// simd16 counterparts of the three entry points above, taking simd16vector / simd16scalari
// arguments; declared only when USE_SIMD16_FRONTEND is non-zero.
1146 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1147 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1148 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1149 #endif
1150