swr/rast: Move more RTAI handling out of binner
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
// Declared extern here, so the definitions live in another translation unit. The THREAD
// qualifier and the "tls" prefix suggest one copy per thread - confirm against the THREAD macro.
// Each array holds 7 SIMD vertices. The SIMD512 variant is only declared when
// USE_SIMD16_FRONTEND is enabled.
36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
39 #endif
40
// Clip codes are shifted out of the float mantissa so that a code reinterpreted as a float
// is never a denormalized value when it is used in a float compare.
#define CLIPCODE_SHIFT 23

enum SWR_CLIPCODES
{
    // View frustum planes, one bit each.
    FRUSTUM_LEFT   = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP    = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT  = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR   = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR    = (0x20 << CLIPCODE_SHIFT),

    // Set when w <= 0.
    NEGW           = (0x40 << CLIPCODE_SHIFT),

    // Guardband edges share a single high bit and are told apart by 4 separate LSBs. That is
    // sufficient because guardband codes are combined as a union, rather than an intersection.
    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};

// Every clip-code bit that forces a primitive through the clipper.
#define GUARDBAND_CLIP_MASK (GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR | NEGW)
63
64 template<typename SIMD_T>
65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
66 {
67 clipCodes = SIMD_T::setzero_ps();
68
69 // -w
70 typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
71
72 // FRUSTUM_LEFT
73 typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
74 clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
75
76 // FRUSTUM_TOP
77 vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
78 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
79
80 // FRUSTUM_RIGHT
81 vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
82 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
83
84 // FRUSTUM_BOTTOM
85 vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
86 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
87
88 if (state.rastState.depthClipEnable)
89 {
90 // FRUSTUM_NEAR
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state.rastState.clipHalfZ)
93 {
94 vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
95 }
96 else
97 {
98 vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
99 }
100 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
101
102 // FRUSTUM_FAR
103 vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
104 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
105 }
106
107 // NEGW
108 vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
109 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
110
111 // GUARDBAND_LEFT
112 typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
113 vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
114 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
115
116 // GUARDBAND_TOP
117 gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
118 vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
119 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
120
121 // GUARDBAND_RIGHT
122 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
123 vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
124 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
125
126 // GUARDBAND_BOTTOM
127 gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
128 vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
129 clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
130 }
131
132 template<typename SIMD_T>
133 struct BinnerChooser
134 {
135 };
136
137 template<>
138 struct BinnerChooser<SIMD256>
139 {
140 PFN_PROCESS_PRIMS pfnBinFunc;
141
142 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
143 :pfnBinFunc(nullptr)
144 {
145 if (numVertsPerPrim == 3)
146 {
147 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
148
149 }
150 else if (numVertsPerPrim == 2)
151 {
152 pfnBinFunc = BinLines;
153 }
154 else
155 {
156 SWR_ASSERT(0 && "Unexpected points in clipper.");
157 }
158 }
159
160 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
161 :pfnBinFunc(nullptr)
162 {
163 switch (topology)
164 {
165 case TOP_POINT_LIST:
166 pfnBinFunc = BinPoints;
167 break;
168 case TOP_LINE_LIST:
169 case TOP_LINE_STRIP:
170 case TOP_LINE_LOOP:
171 case TOP_LINE_LIST_ADJ:
172 case TOP_LISTSTRIP_ADJ:
173 pfnBinFunc = BinLines;
174 break;
175 default:
176 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
177 break;
178 };
179 }
180
181 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
182 {
183 SWR_ASSERT(pfnBinFunc != nullptr);
184
185 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
186 }
187 };
188
189 #if USE_SIMD16_FRONTEND
190 template<>
191 struct BinnerChooser<SIMD512>
192 {
193 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
194
195 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
196 :pfnBinFunc(nullptr)
197 {
198 if (numVertsPerPrim == 3)
199 {
200 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
201
202 }
203 else if (numVertsPerPrim == 2)
204 {
205 pfnBinFunc = BinLines_simd16;
206 }
207 else
208 {
209 SWR_ASSERT(0 && "Unexpected points in clipper.");
210 }
211 }
212
213 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
214 :pfnBinFunc(nullptr)
215 {
216 switch (topology)
217 {
218 case TOP_POINT_LIST:
219 pfnBinFunc = BinPoints_simd16;
220 break;
221 case TOP_LINE_LIST:
222 case TOP_LINE_STRIP:
223 case TOP_LINE_LOOP:
224 case TOP_LINE_LIST_ADJ:
225 case TOP_LISTSTRIP_ADJ:
226 pfnBinFunc = BinLines_simd16;
227 break;
228 default:
229 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
230 break;
231 };
232 }
233
234 void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
235 {
236 SWR_ASSERT(pfnBinFunc != nullptr);
237
238 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
239 }
240 };
241
242 #endif
243 template<typename SIMD_T>
244 struct SimdHelper
245 {
246 };
247
248 template<>
249 struct SimdHelper<SIMD256>
250 {
251 static SIMD256::Float insert_lo_ps(SIMD256::Float a)
252 {
253 return a;
254 }
255
256 static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
257 {
258 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
259 }
260 };
261
262 #if USE_SIMD16_FRONTEND
263 template<>
264 struct SimdHelper<SIMD512>
265 {
266 static SIMD512::Float insert_lo_ps(SIMD256::Float a)
267 {
268 return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
269 }
270
271 static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
272 {
273 return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
274 }
275 };
276
277 #endif
278 // Temp storage used by the clipper
279 template<typename SIMD_T>
280 struct ClipHelper
281 {
282 };
283
284 template<>
285 struct ClipHelper<SIMD256>
286 {
287 static SIMDVERTEX_T<SIMD256> *GetTempVertices()
288 {
289 return tlsTempVertices;
290 }
291 };
292
293 #if USE_SIMD16_FRONTEND
294 template<>
295 struct ClipHelper<SIMD512>
296 {
297 static SIMDVERTEX_T<SIMD512> *GetTempVertices()
298 {
299 return tlsTempVertices_simd16;
300 }
301 };
302
303 #endif
304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
305 class Clipper
306 {
307 public:
308 INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
309 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
310 {
311 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
312 }
313
314 void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
315 {
316 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
317 {
318 ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
319 }
320 }
321
322 typename SIMD_T::Float ComputeClipCodeIntersection()
323 {
324 typename SIMD_T::Float result = clipCodes[0];
325
326 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
327 {
328 result = SIMD_T::and_ps(result, clipCodes[i]);
329 }
330
331 return result;
332 }
333
334 typename SIMD_T::Float ComputeClipCodeUnion()
335 {
336 typename SIMD_T::Float result = clipCodes[0];
337
338 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
339 {
340 result = SIMD_T::or_ps(result, clipCodes[i]);
341 }
342
343 return result;
344 }
345
346 int ComputeClipMask()
347 {
348 typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
349
350 clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
351
352 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
353 }
354
355 // clipper is responsible for culling any prims with NAN coordinates
356 int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
357 {
358 typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
359
360 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
361 {
362 typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
363 vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
364
365 typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
366 vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
367 }
368
369 return SIMD_T::movemask_ps(vNanMask);
370 }
371
372 int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
373 {
374 uint8_t cullMask = state.backendState.cullDistanceMask;
375 uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
376
377 typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
378
379 typename SIMD_T::Vec4 vClipCullDistLo[3];
380 typename SIMD_T::Vec4 vClipCullDistHi[3];
381
382 pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
383 pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
384
385 DWORD index;
386 while (_BitScanForward(&index, cullMask))
387 {
388 cullMask &= ~(1 << index);
389 uint32_t slot = index >> 2;
390 uint32_t component = index & 0x3;
391
392 typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
393 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
394 {
395 typename SIMD_T::Float vCullComp;
396 if (slot == 0)
397 {
398 vCullComp = vClipCullDistLo[e][component];
399 }
400 else
401 {
402 vCullComp = vClipCullDistHi[e][component];
403 }
404
405 // cull if cull distance < 0 || NAN
406 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
407 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
408 }
409 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
410 }
411
412 // clipper should also discard any primitive with NAN clip distance
413 uint8_t clipMask = state.backendState.clipDistanceMask;
414 while (_BitScanForward(&index, clipMask))
415 {
416 clipMask &= ~(1 << index);
417 uint32_t slot = index >> 2;
418 uint32_t component = index & 0x3;
419
420 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
421 {
422 typename SIMD_T::Float vClipComp;
423 if (slot == 0)
424 {
425 vClipComp = vClipCullDistLo[e][component];
426 }
427 else
428 {
429 vClipComp = vClipCullDistHi[e][component];
430 }
431
432 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
433 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
434 }
435 }
436
437 return SIMD_T::movemask_ps(vClipCullMask);
438 }
439
440 void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
441 const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
442 {
443 // input/output vertex store for clipper
444 SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
445
446 uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
447 uint32_t provokingVertex = 0;
448 if (pa.binTopology == TOP_TRIANGLE_FAN)
449 {
450 provokingVertex = state.frontendState.provokingVertex.triFan;
451 }
452 ///@todo: line topology for wireframe?
453
454 // assemble pos
455 typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
456 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
457 {
458 vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
459 }
460
461 // assemble attribs
462 const SWR_BACKEND_STATE& backendState = state.backendState;
463
464 int32_t maxSlot = -1;
465 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
466 {
467 // Compute absolute attrib slot in vertex array
468 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
469 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
470 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
471
472 pa.Assemble(inputSlot, tmpVector);
473
474 // if constant interpolation enabled for this attribute, assign the provoking
475 // vertex values to all edges
476 if (CheckBit(constantInterpMask, slot))
477 {
478 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
479 {
480 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
481 }
482 }
483 else
484 {
485 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
486 {
487 vertices[i].attrib[inputSlot] = tmpVector[i];
488 }
489 }
490 }
491
492 // assemble user clip distances if enabled
493 uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
494 if (state.backendState.clipDistanceMask & 0xf)
495 {
496 pa.Assemble(vertexClipCullSlot, tmpVector);
497 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
498 {
499 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
500 }
501 }
502
503 if (state.backendState.clipDistanceMask & 0xf0)
504 {
505 pa.Assemble(vertexClipCullSlot + 1, tmpVector);
506 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
507 {
508 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
509 }
510 }
511
512 uint32_t numAttribs = maxSlot + 1;
513
514 typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
515
516 BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
517
518 // set up new PA for binning clipped primitives
519 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
520 if (NumVertsPerPrim == 3)
521 {
522 clipTopology = TOP_TRIANGLE_FAN;
523
524 // so that the binner knows to bloat wide points later
525 if (pa.binTopology == TOP_POINT_LIST)
526 {
527 clipTopology = TOP_POINT_LIST;
528 }
529 }
530 else if (NumVertsPerPrim == 2)
531 {
532 clipTopology = TOP_LINE_LIST;
533 }
534 else
535 {
536 SWR_ASSERT(0 && "Unexpected points in clipper.");
537 }
538
539 const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
540 const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
541 const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
542 const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
543
544 const SIMD256::Integer vOffsets = SIMD256::set_epi32(
545 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
546 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
547 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
548 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
549 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
550 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
551 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
552 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
553
554 // only need to gather 7 verts
555 // @todo dynamic mask based on actual # of verts generated per lane
556 const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
557
558 uint32_t numClippedPrims = 0;
559
560 // tranpose clipper output so that each lane's vertices are in SIMD order
561 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
562 // for triangle fan
563
564 #if defined(_DEBUG)
565 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
566 SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
567
568 #else
569 SIMDVERTEX_T<SIMD_T> transposedPrims[2];
570
571 #endif
572 uint32_t numInputPrims = pa.NumPrims();
573 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
574 {
575 uint32_t numEmittedVerts = pVertexCount[inputPrim];
576 if (numEmittedVerts < NumVertsPerPrim)
577 {
578 continue;
579 }
580 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
581
582 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
583 SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
584
585 numClippedPrims += numEmittedPrims;
586
587 // tranpose clipper output so that each lane's vertices are in SIMD order
588 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
589 // for triangle fan
590
591 // transpose pos
592 uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
593
594 #if 0
595 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
596 static const float *dummy = reinterpret_cast<const float *>(pBase);
597
598 #endif
599 for (uint32_t c = 0; c < 4; ++c)
600 {
601 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
602 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
603 pBase += sizeof(typename SIMD_T::Float);
604 }
605
606 // transpose attribs
607 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
608
609 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
610 {
611 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
612
613 for (uint32_t c = 0; c < 4; ++c)
614 {
615 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
616 transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
617 pBase += sizeof(typename SIMD_T::Float);
618 }
619 }
620
621 // transpose user clip distances if enabled
622 uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
623 if (state.backendState.clipDistanceMask & 0x0f)
624 {
625 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
626
627 for (uint32_t c = 0; c < 4; ++c)
628 {
629 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
630 transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
631 pBase += sizeof(typename SIMD_T::Float);
632 }
633 }
634
635 if (state.backendState.clipDistanceMask & 0xf0)
636 {
637 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
638
639 for (uint32_t c = 0; c < 4; ++c)
640 {
641 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
642 transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
643 pBase += sizeof(typename SIMD_T::Float);
644 }
645 }
646
647 PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
648 clipPA.viewportArrayActive = pa.viewportArrayActive;
649 clipPA.rtArrayActive = pa.rtArrayActive;
650
651 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
652
653 const uint32_t primMask = primMaskMap[numEmittedPrims];
654
655 const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
656 const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
657 const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
658
659
660 while (clipPA.GetNextStreamOutput())
661 {
662 do
663 {
664 typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
665
666 bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
667
668 if (assemble)
669 {
670 binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
671 }
672
673 } while (clipPA.NextPrim());
674 }
675 }
676
677 #if defined(_DEBUG)
678 AlignedFree(transposedPrims);
679
680 #endif
681 // update global pipeline stat
682 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
683 }
684
685 void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
686 typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
687 {
688 SWR_ASSERT(pa.pDC != nullptr);
689
690 SWR_CONTEXT *pContext = pa.pDC->pContext;
691
692 BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
693
694 // update clipper invocations pipeline stat
695 uint32_t numInvoc = _mm_popcnt_u32(primMask);
696 UPDATE_STAT_FE(CInvocations, numInvoc);
697
698 ComputeClipCodes(prim, viewportIdx);
699
700 // cull prims with NAN coords
701 primMask &= ~ComputeNaNMask(prim);
702
703 // user cull distance cull
704 if (state.backendState.cullDistanceMask)
705 {
706 primMask &= ~ComputeUserClipCullMask(pa, prim);
707 }
708
709 // cull prims outside view frustum
710 typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
711 int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
712
713 // skip clipping for points
714 uint32_t clipMask = 0;
715 if (NumVertsPerPrim != 1)
716 {
717 clipMask = primMask & ComputeClipMask();
718 }
719
720 if (clipMask)
721 {
722 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
723 // we have to clip tris, execute the clipper, which will also
724 // call the binner
725 ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
726 AR_END(FEGuardbandClip, 1);
727 }
728 else if (validMask)
729 {
730 // update CPrimitives pipeline state
731 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
732
733 // forward valid prims directly to binner
734 binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
735 }
736 }
737
738 private:
739 typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
740 {
741 return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
742 }
743
744 typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
745 {
746 const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
747 const uint32_t componentStride = sizeof(typename SIMD_T::Float);
748 const uint32_t attribStride = sizeof(typename SIMD_T::Vec4);
749
750 static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
751 {
752 0 * sizeof(float),
753 1 * sizeof(float),
754 2 * sizeof(float),
755 3 * sizeof(float),
756 4 * sizeof(float),
757 5 * sizeof(float),
758 6 * sizeof(float),
759 7 * sizeof(float),
760 8 * sizeof(float),
761 9 * sizeof(float),
762 10 * sizeof(float),
763 11 * sizeof(float),
764 12 * sizeof(float),
765 13 * sizeof(float),
766 14 * sizeof(float),
767 15 * sizeof(float),
768 };
769
770 static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
771
772 typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
773
774 // step to the simdvertex
775 typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
776
777 // step to the attribute and component
778 vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
779
780 // step to the lane
781 vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
782
783 return vOffsets;
784 }
785
786 typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
787 {
788 typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
789 typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
790
791 return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
792 }
793
794 void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
795 {
796 typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
797
798 const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
799 const float *pSrc = reinterpret_cast<const float *>(&vSrc);
800 uint32_t mask = SIMD_T::movemask_ps(vMask);
801 DWORD lane;
802 while (_BitScanForward(&lane, mask))
803 {
804 mask &= ~(1 << lane);
805 const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
806 *(float *)pBuf = pSrc[lane];
807 }
808 }
809
810 template<SWR_CLIPCODES ClippingPlane>
811 void intersect(
812 const typename SIMD_T::Float &vActiveMask, // active lanes to operate on
813 const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts.
814 const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts.
815 const typename SIMD_T::Vec4 &v1, // vertex 0 position
816 const typename SIMD_T::Vec4 &v2, // vertex 1 position
817 typename SIMD_T::Integer &outIndex, // output index.
818 const float *pInVerts, // array of all the input positions.
819 uint32_t numInAttribs, // number of attributes per vertex.
820 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
821 {
822 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
823 uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
824
825 // compute interpolation factor
826 typename SIMD_T::Float t;
827 switch (ClippingPlane)
828 {
829 case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
830 case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
831 case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
832 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
833 case FRUSTUM_NEAR:
834 // DX Znear plane is 0, GL is -w
835 if (this->state.rastState.clipHalfZ)
836 {
837 t = ComputeInterpFactor(v1[2], v2[2]);
838 }
839 else
840 {
841 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
842 }
843 break;
844 case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
845 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
846 };
847
848 // interpolate position and store
849 for (uint32_t c = 0; c < 4; ++c)
850 {
851 typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
852 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
853 }
854
855 // interpolate attributes and store
856 for (uint32_t a = 0; a < numInAttribs; ++a)
857 {
858 uint32_t attribSlot = vertexAttribOffset + a;
859 for (uint32_t c = 0; c < 4; ++c)
860 {
861 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
862 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
863 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
864 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
865 }
866 }
867
868 // interpolate clip distance if enabled
869 if (this->state.backendState.clipDistanceMask & 0xf)
870 {
871 uint32_t attribSlot = vertexClipCullOffset;
872 for (uint32_t c = 0; c < 4; ++c)
873 {
874 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
875 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
876 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
877 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
878 }
879 }
880
881 if (this->state.backendState.clipDistanceMask & 0xf0)
882 {
883 uint32_t attribSlot = vertexClipCullOffset + 1;
884 for (uint32_t c = 0; c < 4; ++c)
885 {
886 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
887 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
888 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
889 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
890 }
891 }
892 }
893
894 template<SWR_CLIPCODES ClippingPlane>
895 typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
896 {
897 switch (ClippingPlane)
898 {
899 case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
900 case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]);
901 case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
902 case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]);
903 case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
904 case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]);
905 default:
906 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
907 return SIMD_T::setzero_ps();
908 }
909 }
910
911 template<SWR_CLIPCODES ClippingPlane>
912 typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
913 {
914 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
915
916 typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
917 typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
918 typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
919
920 while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
921 {
922 typename SIMD_T::Integer s = vCurIndex;
923 typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
924 typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
925 p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
926
927 // gather position
928 typename SIMD_T::Vec4 vInPos0, vInPos1;
929 for (uint32_t c = 0; c < 4; ++c)
930 {
931 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
932 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
933 }
934
935 // compute inside mask
936 typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
937 typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
938
939 // compute intersection mask (s_in != p_in)
940 typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
941 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
942
943 // store s if inside
944 s_in = SIMD_T::and_ps(s_in, vActiveMask);
945 if (!SIMD_T::testz_ps(s_in, s_in))
946 {
947 // store position
948 for (uint32_t c = 0; c < 4; ++c)
949 {
950 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
951 }
952
953 // store attribs
954 for (uint32_t a = 0; a < numInAttribs; ++a)
955 {
956 uint32_t attribSlot = vertexAttribOffset + a;
957 for (uint32_t c = 0; c < 4; ++c)
958 {
959 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
960 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
961 }
962 }
963
964 // store clip distance if enabled
965 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
966 if (this->state.backendState.clipDistanceMask & 0xf)
967 {
968 uint32_t attribSlot = vertexClipCullSlot;
969 for (uint32_t c = 0; c < 4; ++c)
970 {
971 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
972 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
973 }
974 }
975
976 if (this->state.backendState.clipDistanceMask & 0xf0)
977 {
978 uint32_t attribSlot = vertexClipCullSlot + 1;
979 for (uint32_t c = 0; c < 4; ++c)
980 {
981 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
982 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
983 }
984 }
985
986 // increment outIndex
987 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
988 }
989
990 // compute and store intersection
991 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
992 {
993 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
994
995 // increment outIndex for active lanes
996 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
997 }
998
999 // increment loop index and update active mask
1000 vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1001 vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1002 }
1003
1004 return vOutIndex;
1005 }
1006
1007 template<SWR_CLIPCODES ClippingPlane>
1008 typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1009 {
1010 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1011
1012 typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1013 typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1014 typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1015
1016 if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1017 {
1018 typename SIMD_T::Integer s = vCurIndex;
1019 typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1020
1021 // gather position
1022 typename SIMD_T::Vec4 vInPos0, vInPos1;
1023 for (uint32_t c = 0; c < 4; ++c)
1024 {
1025 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1026 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1027 }
1028
1029 // compute inside mask
1030 typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1031 typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1032
1033 // compute intersection mask (s_in != p_in)
1034 typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1035 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1036
1037 // store s if inside
1038 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1039 if (!SIMD_T::testz_ps(s_in, s_in))
1040 {
1041 for (uint32_t c = 0; c < 4; ++c)
1042 {
1043 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1044 }
1045
1046 // interpolate attributes and store
1047 for (uint32_t a = 0; a < numInAttribs; ++a)
1048 {
1049 uint32_t attribSlot = vertexAttribOffset + a;
1050 for (uint32_t c = 0; c < 4; ++c)
1051 {
1052 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1053 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1054 }
1055 }
1056
1057 // increment outIndex
1058 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1059 }
1060
1061 // compute and store intersection
1062 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1063 {
1064 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1065
1066 // increment outIndex for active lanes
1067 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1068 }
1069
1070 // store p if inside
1071 p_in = SIMD_T::and_ps(p_in, vActiveMask);
1072 if (!SIMD_T::testz_ps(p_in, p_in))
1073 {
1074 for (uint32_t c = 0; c < 4; ++c)
1075 {
1076 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1077 }
1078
1079 // interpolate attributes and store
1080 for (uint32_t a = 0; a < numInAttribs; ++a)
1081 {
1082 uint32_t attribSlot = vertexAttribOffset + a;
1083 for (uint32_t c = 0; c < 4; ++c)
1084 {
1085 typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1086 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1087 }
1088 }
1089
1090 // increment outIndex
1091 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1092 }
1093 }
1094
1095 return vOutIndex;
1096 }
1097
1098 typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1099 {
1100 // temp storage
1101 float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1102
1103 // zero out num input verts for non-active lanes
1104 typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1105 vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1106
1107 // clip prims to frustum
1108 typename SIMD_T::Integer vNumOutPts;
1109 if (NumVertsPerPrim == 3)
1110 {
1111 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1112 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1113 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1114 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1115 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1116 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1117 }
1118 else
1119 {
1120 SWR_ASSERT(NumVertsPerPrim == 2);
1121 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1122 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1126 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1127 }
1128
1129 // restore num verts for non-clipped, active lanes
1130 typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1131 vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1132
1133 return vNumOutPts;
1134 }
1135
// Data members of the clipper.
// Worker index; the in-class initializer defaults it to 0.
// NOTE(review): presumably set by a constructor outside this view - confirm.
1136 const uint32_t workerId{ 0 };
// Draw context pointer; the in-class initializer defaults it to nullptr.
1137 DRAW_CONTEXT *pDC{ nullptr };
// Read-only API state; the clip routines read backendState and rastState through it.
1138 const API_STATE &state;
// One SIMD float per primitive vertex.
// NOTE(review): presumably the per-vertex clip codes; the code that fills it is outside this view.
1139 typename SIMD_T::Float clipCodes[NumVertsPerPrim];
1140 };
1141
1142
1143 // pipeline stage functions
// Declarations only; the definitions are not part of this header chunk.
// Each entry point takes the draw context, the primitive-assembly state, the worker id, the
// primitive vertex vectors, a primitive mask and per-primitive id / viewport index /
// render-target index vectors.
// NOTE(review): primMask looks like one bit per SIMD lane - confirm against the definitions.
1144 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1145 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
1146 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
// The simd16 variants take simd16 vector/scalar types and are declared only when
// USE_SIMD16_FRONTEND is non-zero.
1147 #if USE_SIMD16_FRONTEND
1148 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1149 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1150 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
1151 #endif
1152