7b4ed58c3fa28cca4c1021da71ca8af0da7f8cca
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
// Temp storage used by the clipper: one THREAD-qualified array of 7 SIMD
// vertices per SIMD width (presumably thread-local; confirm against the THREAD
// macro). ClipHelper<SIMD_T>::GetTempVertices() hands these arrays out.
extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
#if USE_SIMD16_FRONTEND
// SIMD16 counterpart, only declared when the 16-wide front end is enabled.
extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
#endif
40
/// Per-vertex clip code bits, one per clipping plane / condition.
enum SWR_CLIPCODES
{
// Clip codes are shifted out of the float mantissa so that a code used in a
// float compare is never a denormalized value.
// The guardband codes can share a single high bit and be told apart by 4
// separate LSBs, because the guardband test computes a union, rather than an
// intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
    // view frustum, x/y planes
    FRUSTUM_LEFT   = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP    = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT  = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),

    // view frustum, depth planes
    FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR  = (0x20 << CLIPCODE_SHIFT),

    // w <= 0
    NEGW = (0x40 << CLIPCODE_SHIFT),

    // guardband: shared high bit plus one low bit per side
    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};
62
// Union of the guardband, near/far and negative-w clip codes.
#define GUARDBAND_CLIP_MASK                                                         \
    (GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM |          \
     FRUSTUM_NEAR | FRUSTUM_FAR | NEGW)
// Union of the six view-frustum plane clip codes.
#define FRUSTUM_CLIP_MASK                                                           \
    (FRUSTUM_LEFT | FRUSTUM_TOP | FRUSTUM_RIGHT | FRUSTUM_BOTTOM | FRUSTUM_NEAR |   \
     FRUSTUM_FAR)
68
69 template <typename SIMD_T>
70 void ComputeClipCodes(const API_STATE& state,
71 const Vec4<SIMD_T>& vertex,
72 Float<SIMD_T>& clipCodes,
73 Integer<SIMD_T> const& viewportIndexes)
74 {
75 clipCodes = SIMD_T::setzero_ps();
76
77 // -w
78 Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
79
80 // FRUSTUM_LEFT
81 Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
82 clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
83
84 // FRUSTUM_TOP
85 vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
86 clipCodes = SIMD_T::or_ps(
87 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
88
89 // FRUSTUM_RIGHT
90 vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
91 clipCodes = SIMD_T::or_ps(
92 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
93
94 // FRUSTUM_BOTTOM
95 vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
96 clipCodes = SIMD_T::or_ps(
97 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
98
99 if (state.rastState.depthClipEnable)
100 {
101 // FRUSTUM_NEAR
102 // DX clips depth [0..w], GL clips [-w..w]
103 if (state.rastState.clipHalfZ)
104 {
105 vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
106 }
107 else
108 {
109 vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
110 }
111 clipCodes = SIMD_T::or_ps(
112 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
113
114 // FRUSTUM_FAR
115 vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
116 clipCodes = SIMD_T::or_ps(
117 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
118 }
119
120 // NEGW
121 vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
122 clipCodes =
123 SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
124
125 // GUARDBAND_LEFT
126 Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
127 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
128 &state.gbState.left[0], viewportIndexes));
129 vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
130 clipCodes = SIMD_T::or_ps(
131 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
132
133 // GUARDBAND_TOP
134 gbMult = SIMD_T::mul_ps(vNegW,
135 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
136 &state.gbState.top[0], viewportIndexes));
137 vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
138 clipCodes = SIMD_T::or_ps(
139 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
140
141 // GUARDBAND_RIGHT
142 gbMult = SIMD_T::mul_ps(vertex.w,
143 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
144 &state.gbState.right[0], viewportIndexes));
145 vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
146 clipCodes = SIMD_T::or_ps(
147 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
148
149 // GUARDBAND_BOTTOM
150 gbMult = SIMD_T::mul_ps(vertex.w,
151 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
152 &state.gbState.bottom[0], viewportIndexes));
153 vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
154 clipCodes = SIMD_T::or_ps(
155 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
156 }
157
/// Primary template, intentionally empty: the binner selection logic lives in
/// the per-SIMD-width specializations.
template <typename SIMD_T>
struct BinnerChooser
{
};
162
163 template <>
164 struct BinnerChooser<SIMD256>
165 {
166 PFN_PROCESS_PRIMS pfnBinFunc;
167
168 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
169 :
170 pfnBinFunc(nullptr)
171 {
172 if (numVertsPerPrim == 3)
173 {
174 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
175
176 }
177 else if (numVertsPerPrim == 2)
178 {
179 pfnBinFunc = BinLines;
180 }
181 else
182 {
183 SWR_ASSERT(0 && "Unexpected points in clipper.");
184 }
185 }
186
187 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
188 :
189 pfnBinFunc(nullptr)
190 {
191 switch (topology)
192 {
193 case TOP_POINT_LIST:
194 pfnBinFunc = BinPoints;
195 break;
196 case TOP_LINE_LIST:
197 case TOP_LINE_STRIP:
198 case TOP_LINE_LOOP:
199 case TOP_LINE_LIST_ADJ:
200 case TOP_LISTSTRIP_ADJ:
201 pfnBinFunc = BinLines;
202 break;
203 default:
204 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
205 break;
206 };
207 }
208
209 void BinFunc(DRAW_CONTEXT* pDC,
210 PA_STATE& pa,
211 uint32_t workerId,
212 SIMD256::Vec4 prims[],
213 uint32_t primMask,
214 SIMD256::Integer const& primID,
215 SIMD256::Integer& viewportIdx,
216 SIMD256::Integer& rtIdx)
217 {
218 SWR_ASSERT(pfnBinFunc != nullptr);
219
220 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
221 }
222 };
223
224 #if USE_SIMD16_FRONTEND
225 template <>
226 struct BinnerChooser<SIMD512>
227 {
228 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
229
230 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
231 :
232 pfnBinFunc(nullptr)
233 {
234 if (numVertsPerPrim == 3)
235 {
236 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
237
238 }
239 else if (numVertsPerPrim == 2)
240 {
241 pfnBinFunc = BinLines_simd16;
242 }
243 else
244 {
245 SWR_ASSERT(0 && "Unexpected points in clipper.");
246 }
247 }
248
249 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
250 :
251 pfnBinFunc(nullptr)
252 {
253 switch (topology)
254 {
255 case TOP_POINT_LIST:
256 pfnBinFunc = BinPoints_simd16;
257 break;
258 case TOP_LINE_LIST:
259 case TOP_LINE_STRIP:
260 case TOP_LINE_LOOP:
261 case TOP_LINE_LIST_ADJ:
262 case TOP_LISTSTRIP_ADJ:
263 pfnBinFunc = BinLines_simd16;
264 break;
265 default:
266 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
267 break;
268 };
269 }
270
271 void BinFunc(DRAW_CONTEXT* pDC,
272 PA_STATE& pa,
273 uint32_t workerId,
274 SIMD512::Vec4 prims[],
275 uint32_t primMask,
276 SIMD512::Integer const& primID,
277 SIMD512::Integer& viewportIdx,
278 SIMD512::Integer& rtIdx)
279 {
280 SWR_ASSERT(pfnBinFunc != nullptr);
281
282 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
283 }
284 };
285
286 #endif
/// Primary template, intentionally empty: the width-specific helpers are
/// provided by the per-SIMD-width specializations.
template <typename SIMD_T>
struct SimdHelper
{
};
291
292 template <>
293 struct SimdHelper<SIMD256>
294 {
295 static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
296
297 static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
298 {
299 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
300 }
301 };
302
303 #if USE_SIMD16_FRONTEND
304 template <>
305 struct SimdHelper<SIMD512>
306 {
307 static SIMD512::Float insert_lo_ps(SIMD256::Float a)
308 {
309 return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
310 }
311
312 static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
313 {
314 return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
315 }
316 };
317
318 #endif
// Access to the temp storage used by the clipper. The primary template is
// intentionally empty; each SIMD width provides its own specialization.
template <typename SIMD_T>
struct ClipHelper
{
};
324
325 template <>
326 struct ClipHelper<SIMD256>
327 {
328 static SIMDVERTEX_T<SIMD256>* GetTempVertices() { return tlsTempVertices; }
329 };
330
331 #if USE_SIMD16_FRONTEND
332 template <>
333 struct ClipHelper<SIMD512>
334 {
335 static SIMDVERTEX_T<SIMD512>* GetTempVertices() { return tlsTempVertices_simd16; }
336 };
337
338 #endif
339 template <typename SIMD_T, uint32_t NumVertsPerPrim>
340 class Clipper
341 {
342 public:
343 INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
344 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
345 {
346 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
347 }
348
349 void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
350 {
351 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
352 {
353 ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
354 }
355 }
356
357 Float<SIMD_T> ComputeClipCodeIntersection()
358 {
359 Float<SIMD_T> result = clipCodes[0];
360
361 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
362 {
363 result = SIMD_T::and_ps(result, clipCodes[i]);
364 }
365
366 return result;
367 }
368
369 Float<SIMD_T> ComputeClipCodeUnion()
370 {
371 Float<SIMD_T> result = clipCodes[0];
372
373 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
374 {
375 result = SIMD_T::or_ps(result, clipCodes[i]);
376 }
377
378 return result;
379 }
380
381 int ComputeClipMask()
382 {
383 Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
384
385 clipUnion =
386 SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
387
388 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
389 }
390
391 // clipper is responsible for culling any prims with NAN coordinates
392 int ComputeNaNMask(Vec4<SIMD_T> prim[])
393 {
394 Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
395
396 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
397 {
398 Float<SIMD_T> vNan01 =
399 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
400 vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
401
402 Float<SIMD_T> vNan23 =
403 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
404 vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
405 }
406
407 return SIMD_T::movemask_ps(vNanMask);
408 }
409
410 int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
411 {
412 uint8_t cullMask = state.backendState.cullDistanceMask;
413 uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
414
415 Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
416
417 Vec4<SIMD_T> vClipCullDistLo[3];
418 Vec4<SIMD_T> vClipCullDistHi[3];
419
420 pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
421 pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
422
423 DWORD index;
424 while (_BitScanForward(&index, cullMask))
425 {
426 cullMask &= ~(1 << index);
427 uint32_t slot = index >> 2;
428 uint32_t component = index & 0x3;
429
430 Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
431 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
432 {
433 Float<SIMD_T> vCullComp;
434 if (slot == 0)
435 {
436 vCullComp = vClipCullDistLo[e][component];
437 }
438 else
439 {
440 vCullComp = vClipCullDistHi[e][component];
441 }
442
443 // cull if cull distance < 0 || NAN
444 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
445 SIMD_T::setzero_ps(), vCullComp);
446 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
447 }
448 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
449 }
450
451 // clipper should also discard any primitive with NAN clip distance
452 uint8_t clipMask = state.backendState.clipDistanceMask;
453 while (_BitScanForward(&index, clipMask))
454 {
455 clipMask &= ~(1 << index);
456 uint32_t slot = index >> 2;
457 uint32_t component = index & 0x3;
458
459 Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
460 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
461 {
462 Float<SIMD_T> vClipComp;
463 if (slot == 0)
464 {
465 vClipComp = vClipCullDistLo[e][component];
466 }
467 else
468 {
469 vClipComp = vClipCullDistHi[e][component];
470 }
471
472 Float<SIMD_T> vClip =
473 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
474 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
475 SIMD_T::setzero_ps(), vClipComp);
476 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
477 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
478 }
479 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
480 }
481
482 return SIMD_T::movemask_ps(vClipCullMask);
483 }
484
485 void ClipSimd(const Vec4<SIMD_T> prim[],
486 const Float<SIMD_T>& vPrimMask,
487 const Float<SIMD_T>& vClipMask,
488 PA_STATE& pa,
489 const Integer<SIMD_T>& vPrimId,
490 const Integer<SIMD_T>& vViewportIdx,
491 const Integer<SIMD_T>& vRtIdx)
492 {
493 // input/output vertex store for clipper
494 SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
495
496 uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
497 uint32_t provokingVertex = 0;
498 if (pa.binTopology == TOP_TRIANGLE_FAN)
499 {
500 provokingVertex = state.frontendState.provokingVertex.triFan;
501 }
502 ///@todo: line topology for wireframe?
503
504 // assemble pos
505 Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
506 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
507 {
508 vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
509 }
510
511 // assemble attribs
512 const SWR_BACKEND_STATE& backendState = state.backendState;
513
514 int32_t maxSlot = -1;
515 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
516 {
517 // Compute absolute attrib slot in vertex array
518 uint32_t mapSlot =
519 backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
520 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
521 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
522
523 pa.Assemble(inputSlot, tmpVector);
524
525 // if constant interpolation enabled for this attribute, assign the provoking
526 // vertex values to all edges
527 if (CheckBit(constantInterpMask, slot))
528 {
529 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
530 {
531 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
532 }
533 }
534 else
535 {
536 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
537 {
538 vertices[i].attrib[inputSlot] = tmpVector[i];
539 }
540 }
541 }
542
543 // assemble user clip distances if enabled
544 uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
545 if (state.backendState.clipDistanceMask & 0xf)
546 {
547 pa.Assemble(vertexClipCullSlot, tmpVector);
548 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
549 {
550 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
551 }
552 }
553
554 if (state.backendState.clipDistanceMask & 0xf0)
555 {
556 pa.Assemble(vertexClipCullSlot + 1, tmpVector);
557 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
558 {
559 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
560 }
561 }
562
563 uint32_t numAttribs = maxSlot + 1;
564
565 Integer<SIMD_T> vNumClippedVerts =
566 ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
567
568 BinnerChooser<SIMD_T> binner(NumVertsPerPrim,
569 pa.pDC->pState->state.rastState.conservativeRast);
570
571 // set up new PA for binning clipped primitives
572 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
573 if (NumVertsPerPrim == 3)
574 {
575 clipTopology = TOP_TRIANGLE_FAN;
576
577 // so that the binner knows to bloat wide points later
578 if (pa.binTopology == TOP_POINT_LIST)
579 {
580 clipTopology = TOP_POINT_LIST;
581 }
582 else if (pa.binTopology == TOP_RECT_LIST)
583 {
584 clipTopology = TOP_RECT_LIST;
585 }
586 }
587 else if (NumVertsPerPrim == 2)
588 {
589 clipTopology = TOP_LINE_LIST;
590 }
591 else
592 {
593 SWR_ASSERT(0 && "Unexpected points in clipper.");
594 }
595
596 const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
597 const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
598 const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
599 const uint32_t* pRtIdx = reinterpret_cast<const uint32_t*>(&vRtIdx);
600
601 const SIMD256::Integer vOffsets =
602 SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
603 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
604 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
605 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
606 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
607 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
608 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
609 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
610
611 // only need to gather 7 verts
612 // @todo dynamic mask based on actual # of verts generated per lane
613 const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
614
615 uint32_t numClippedPrims = 0;
616
617 // tranpose clipper output so that each lane's vertices are in SIMD order
618 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
619 // for triangle fan
620
621 #if defined(_DEBUG)
622 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack
623 // overflow in debug builds
624 SIMDVERTEX_T<SIMD_T>* transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T>*>(
625 AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
626
627 #else
628 SIMDVERTEX_T<SIMD_T> transposedPrims[2];
629
630 #endif
631 uint32_t numInputPrims = pa.NumPrims();
632 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
633 {
634 uint32_t numEmittedVerts = pVertexCount[inputPrim];
635 if (numEmittedVerts < NumVertsPerPrim)
636 {
637 continue;
638 }
639 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
640
641 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
642 SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
643
644 numClippedPrims += numEmittedPrims;
645
646 // tranpose clipper output so that each lane's vertices are in SIMD order
647 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
648 // for triangle fan
649
650 // transpose pos
651 uint8_t* pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
652 sizeof(float) * inputPrim;
653
654 #if 0
655 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
656 static const float *dummy = reinterpret_cast<const float *>(pBase);
657
658 #endif
659 for (uint32_t c = 0; c < 4; ++c)
660 {
661 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
662 SIMD256::setzero_ps(), reinterpret_cast<const float*>(pBase), vOffsets, vMask);
663 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
664 SimdHelper<SIMD_T>::insert_lo_ps(temp);
665 pBase += sizeof(Float<SIMD_T>);
666 }
667
668 // transpose attribs
669 pBase =
670 reinterpret_cast<uint8_t*>(&vertices[0].attrib[backendState.vertexAttribOffset]) +
671 sizeof(float) * inputPrim;
672
673 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
674 {
675 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
676
677 for (uint32_t c = 0; c < 4; ++c)
678 {
679 SIMD256::Float temp =
680 SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
681 SIMD256::setzero_ps(),
682 reinterpret_cast<const float*>(pBase),
683 vOffsets,
684 vMask);
685 transposedPrims[0].attrib[attribSlot][c] =
686 SimdHelper<SIMD_T>::insert_lo_ps(temp);
687 pBase += sizeof(Float<SIMD_T>);
688 }
689 }
690
691 // transpose user clip distances if enabled
692 uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
693 if (state.backendState.clipDistanceMask & 0x0f)
694 {
695 pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot]) +
696 sizeof(float) * inputPrim;
697
698 for (uint32_t c = 0; c < 4; ++c)
699 {
700 SIMD256::Float temp =
701 SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
702 SIMD256::setzero_ps(),
703 reinterpret_cast<const float*>(pBase),
704 vOffsets,
705 vMask);
706 transposedPrims[0].attrib[vertexClipCullSlot][c] =
707 SimdHelper<SIMD_T>::insert_lo_ps(temp);
708 pBase += sizeof(Float<SIMD_T>);
709 }
710 }
711
712 if (state.backendState.clipDistanceMask & 0xf0)
713 {
714 pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
715 sizeof(float) * inputPrim;
716
717 for (uint32_t c = 0; c < 4; ++c)
718 {
719 SIMD256::Float temp =
720 SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
721 SIMD256::setzero_ps(),
722 reinterpret_cast<const float*>(pBase),
723 vOffsets,
724 vMask);
725 transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
726 SimdHelper<SIMD_T>::insert_lo_ps(temp);
727 pBase += sizeof(Float<SIMD_T>);
728 }
729 }
730
731 PA_STATE_OPT clipPA(pDC,
732 numEmittedPrims,
733 reinterpret_cast<uint8_t*>(&transposedPrims[0]),
734 numEmittedVerts,
735 SWR_VTX_NUM_SLOTS,
736 true,
737 NumVertsPerPrim,
738 clipTopology);
739 clipPA.viewportArrayActive = pa.viewportArrayActive;
740 clipPA.rtArrayActive = pa.rtArrayActive;
741
742 static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
743
744 const uint32_t primMask = primMaskMap[numEmittedPrims];
745
746 const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
747 const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
748 const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
749
750 while (clipPA.GetNextStreamOutput())
751 {
752 do
753 {
754 Vec4<SIMD_T> attrib[NumVertsPerPrim];
755
756 bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
757
758 if (assemble)
759 {
760 binner.pfnBinFunc(
761 pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
762 }
763
764 } while (clipPA.NextPrim());
765 }
766 }
767
768 #if defined(_DEBUG)
769 AlignedFree(transposedPrims);
770
771 #endif
772 // update global pipeline stat
773 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
774 }
775
776 void ExecuteStage(PA_STATE& pa,
777 Vec4<SIMD_T> prim[],
778 uint32_t primMask,
779 Integer<SIMD_T> const& primId,
780 Integer<SIMD_T> const& viewportIdx,
781 Integer<SIMD_T> const& rtIdx)
782 {
783 SWR_ASSERT(pa.pDC != nullptr);
784
785 BinnerChooser<SIMD_T> binner(pa.binTopology,
786 pa.pDC->pState->state.rastState.conservativeRast);
787
788 // update clipper invocations pipeline stat
789 uint32_t numInvoc = _mm_popcnt_u32(primMask);
790 UPDATE_STAT_FE(CInvocations, numInvoc);
791
792 ComputeClipCodes(prim, viewportIdx);
793
794 // cull prims with NAN coords
795 primMask &= ~ComputeNaNMask(prim);
796
797 // user cull distance cull
798 if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
799 {
800 primMask &= ~ComputeUserClipCullMask(pa, prim);
801 }
802
803 Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
804 // Mask out non-frustum codes
805 clipIntersection = SIMD_T::and_ps(clipIntersection,
806 SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
807
808 // cull prims outside view frustum
809 int validMask =
810 primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
811
812 // skip clipping for points
813 uint32_t clipMask = 0;
814 if (NumVertsPerPrim != 1)
815 {
816 clipMask = validMask & ComputeClipMask();
817 }
818
819 AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
820
821 if (clipMask)
822 {
823 RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
824 // we have to clip tris, execute the clipper, which will also
825 // call the binner
826 ClipSimd(prim,
827 SIMD_T::vmask_ps(validMask),
828 SIMD_T::vmask_ps(clipMask),
829 pa,
830 primId,
831 viewportIdx,
832 rtIdx);
833 RDTSC_END(FEGuardbandClip, 1);
834 }
835 else if (validMask)
836 {
837 // update CPrimitives pipeline state
838 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
839
840 // forward valid prims directly to binner
841 binner.pfnBinFunc(
842 this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
843 }
844 }
845
846 private:
847 Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
848 Float<SIMD_T> const& boundaryCoord1)
849 {
850 return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
851 }
852
853 Integer<SIMD_T>
854 ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
855 {
856 const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
857 const uint32_t componentStride = sizeof(Float<SIMD_T>);
858 const uint32_t attribStride = sizeof(Vec4<SIMD_T>);
859
860 static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
861 0 * sizeof(float),
862 1 * sizeof(float),
863 2 * sizeof(float),
864 3 * sizeof(float),
865 4 * sizeof(float),
866 5 * sizeof(float),
867 6 * sizeof(float),
868 7 * sizeof(float),
869 8 * sizeof(float),
870 9 * sizeof(float),
871 10 * sizeof(float),
872 11 * sizeof(float),
873 12 * sizeof(float),
874 13 * sizeof(float),
875 14 * sizeof(float),
876 15 * sizeof(float),
877 };
878
879 static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
880 "Clipper::ComputeOffsets, Increase number of element offsets.");
881
882 Integer<SIMD_T> vElemOffset =
883 SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
884
885 // step to the simdvertex
886 Integer<SIMD_T> vOffsets =
887 SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
888
889 // step to the attribute and component
890 vOffsets = SIMD_T::add_epi32(
891 vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
892
893 // step to the lane
894 vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
895
896 return vOffsets;
897 }
898
899 Float<SIMD_T> GatherComponent(const float* pBuffer,
900 uint32_t attrib,
901 Float<SIMD_T> const& vMask,
902 Integer<SIMD_T> const& vIndices,
903 uint32_t component)
904 {
905 Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
906 Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
907
908 return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
909 vSrc, pBuffer, vOffsets, vMask);
910 }
911
912 void ScatterComponent(const float* pBuffer,
913 uint32_t attrib,
914 Float<SIMD_T> const& vMask,
915 Integer<SIMD_T> const& vIndices,
916 uint32_t component,
917 Float<SIMD_T> const& vSrc)
918 {
919 Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
920
921 const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
922 const float* pSrc = reinterpret_cast<const float*>(&vSrc);
923 uint32_t mask = SIMD_T::movemask_ps(vMask);
924 DWORD lane;
925 while (_BitScanForward(&lane, mask))
926 {
927 mask &= ~(1 << lane);
928 const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
929 *(float*)pBuf = pSrc[lane];
930 }
931 }
932
933 template <SWR_CLIPCODES ClippingPlane>
934 void intersect(const Float<SIMD_T>& vActiveMask, // active lanes to operate on
935 const Integer<SIMD_T>& s, // index to first edge vertex v0 in pInPts.
936 const Integer<SIMD_T>& p, // index to second edge vertex v1 in pInPts.
937 const Vec4<SIMD_T>& v1, // vertex 0 position
938 const Vec4<SIMD_T>& v2, // vertex 1 position
939 Integer<SIMD_T>& outIndex, // output index.
940 const float* pInVerts, // array of all the input positions.
941 uint32_t numInAttribs, // number of attributes per vertex.
942 float* pOutVerts) // array of output positions. We'll write our new intersection
943 // point at i*4.
944 {
945 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
946 uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
947
948 // compute interpolation factor
949 Float<SIMD_T> t;
950 switch (ClippingPlane)
951 {
952 case FRUSTUM_LEFT:
953 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
954 break;
955 case FRUSTUM_RIGHT:
956 t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
957 break;
958 case FRUSTUM_TOP:
959 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
960 break;
961 case FRUSTUM_BOTTOM:
962 t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
963 break;
964 case FRUSTUM_NEAR:
965 // DX Znear plane is 0, GL is -w
966 if (this->state.rastState.clipHalfZ)
967 {
968 t = ComputeInterpFactor(v1[2], v2[2]);
969 }
970 else
971 {
972 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
973 }
974 break;
975 case FRUSTUM_FAR:
976 t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
977 break;
978 default:
979 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
980 };
981
982 // interpolate position and store
983 for (uint32_t c = 0; c < 4; ++c)
984 {
985 Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
986 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
987 }
988
989 // interpolate attributes and store
990 for (uint32_t a = 0; a < numInAttribs; ++a)
991 {
992 uint32_t attribSlot = vertexAttribOffset + a;
993 for (uint32_t c = 0; c < 4; ++c)
994 {
995 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
996 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
997 Float<SIMD_T> vOutAttrib =
998 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
999 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1000 }
1001 }
1002
1003 // interpolate clip distance if enabled
1004 if (this->state.backendState.clipDistanceMask & 0xf)
1005 {
1006 uint32_t attribSlot = vertexClipCullOffset;
1007 for (uint32_t c = 0; c < 4; ++c)
1008 {
1009 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1010 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1011 Float<SIMD_T> vOutAttrib =
1012 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1013 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1014 }
1015 }
1016
1017 if (this->state.backendState.clipDistanceMask & 0xf0)
1018 {
1019 uint32_t attribSlot = vertexClipCullOffset + 1;
1020 for (uint32_t c = 0; c < 4; ++c)
1021 {
1022 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1023 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1024 Float<SIMD_T> vOutAttrib =
1025 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1026 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1027 }
1028 }
1029 }
1030
1031 template <SWR_CLIPCODES ClippingPlane>
1032 Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
1033 {
1034 switch (ClippingPlane)
1035 {
1036 case FRUSTUM_LEFT:
1037 return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1038 case FRUSTUM_RIGHT:
1039 return SIMD_T::cmple_ps(v[0], v[3]);
1040 case FRUSTUM_TOP:
1041 return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1042 case FRUSTUM_BOTTOM:
1043 return SIMD_T::cmple_ps(v[1], v[3]);
1044 case FRUSTUM_NEAR:
1045 return SIMD_T::cmpge_ps(v[2],
1046 this->state.rastState.clipHalfZ
1047 ? SIMD_T::setzero_ps()
1048 : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1049 case FRUSTUM_FAR:
1050 return SIMD_T::cmple_ps(v[2], v[3]);
1051 default:
1052 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1053 return SIMD_T::setzero_ps();
1054 }
1055 }
1056
1057 template <SWR_CLIPCODES ClippingPlane>
1058 Integer<SIMD_T> ClipTriToPlane(const float* pInVerts,
1059 const Integer<SIMD_T>& vNumInPts,
1060 uint32_t numInAttribs,
1061 float* pOutVerts)
1062 {
1063 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1064
1065 Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
1066 Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
1067 Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1068
1069 while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1070 {
1071 Integer<SIMD_T> s = vCurIndex;
1072 Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1073 Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
1074 p = SIMD_T::castps_si(SIMD_T::blendv_ps(
1075 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
1076
1077 // gather position
1078 Vec4<SIMD_T> vInPos0, vInPos1;
1079 for (uint32_t c = 0; c < 4; ++c)
1080 {
1081 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1082 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1083 }
1084
1085 // compute inside mask
1086 Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1087 Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1088
1089 // compute intersection mask (s_in != p_in)
1090 Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1091 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1092
1093 // store s if inside
1094 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1095 if (!SIMD_T::testz_ps(s_in, s_in))
1096 {
1097 // store position
1098 for (uint32_t c = 0; c < 4; ++c)
1099 {
1100 ScatterComponent(
1101 pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1102 }
1103
1104 // store attribs
1105 for (uint32_t a = 0; a < numInAttribs; ++a)
1106 {
1107 uint32_t attribSlot = vertexAttribOffset + a;
1108 for (uint32_t c = 0; c < 4; ++c)
1109 {
1110 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1111 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1112 }
1113 }
1114
1115 // store clip distance if enabled
1116 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
1117 if (this->state.backendState.clipDistanceMask & 0xf)
1118 {
1119 uint32_t attribSlot = vertexClipCullSlot;
1120 for (uint32_t c = 0; c < 4; ++c)
1121 {
1122 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1123 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1124 }
1125 }
1126
1127 if (this->state.backendState.clipDistanceMask & 0xf0)
1128 {
1129 uint32_t attribSlot = vertexClipCullSlot + 1;
1130 for (uint32_t c = 0; c < 4; ++c)
1131 {
1132 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1133 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1134 }
1135 }
1136
1137 // increment outIndex
1138 vOutIndex = SIMD_T::blendv_epi32(
1139 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1140 }
1141
1142 // compute and store intersection
1143 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1144 {
1145 intersect<ClippingPlane>(intersectMask,
1146 s,
1147 p,
1148 vInPos0,
1149 vInPos1,
1150 vOutIndex,
1151 pInVerts,
1152 numInAttribs,
1153 pOutVerts);
1154
1155 // increment outIndex for active lanes
1156 vOutIndex = SIMD_T::blendv_epi32(
1157 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1158 }
1159
1160 // increment loop index and update active mask
1161 vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1162 vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1163 }
1164
1165 return vOutIndex;
1166 }
1167
1168 template <SWR_CLIPCODES ClippingPlane>
1169 Integer<SIMD_T> ClipLineToPlane(const float* pInVerts,
1170 const Integer<SIMD_T>& vNumInPts,
1171 uint32_t numInAttribs,
1172 float* pOutVerts)
1173 {
1174 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1175
1176 Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
1177 Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
1178 Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1179
1180 if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1181 {
1182 Integer<SIMD_T> s = vCurIndex;
1183 Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1184
1185 // gather position
1186 Vec4<SIMD_T> vInPos0, vInPos1;
1187 for (uint32_t c = 0; c < 4; ++c)
1188 {
1189 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1190 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1191 }
1192
1193 // compute inside mask
1194 Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1195 Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1196
1197 // compute intersection mask (s_in != p_in)
1198 Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1199 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1200
1201 // store s if inside
1202 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1203 if (!SIMD_T::testz_ps(s_in, s_in))
1204 {
1205 for (uint32_t c = 0; c < 4; ++c)
1206 {
1207 ScatterComponent(
1208 pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1209 }
1210
1211 // interpolate attributes and store
1212 for (uint32_t a = 0; a < numInAttribs; ++a)
1213 {
1214 uint32_t attribSlot = vertexAttribOffset + a;
1215 for (uint32_t c = 0; c < 4; ++c)
1216 {
1217 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1218 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1219 }
1220 }
1221
1222 // increment outIndex
1223 vOutIndex = SIMD_T::blendv_epi32(
1224 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1225 }
1226
1227 // compute and store intersection
1228 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1229 {
1230 intersect<ClippingPlane>(intersectMask,
1231 s,
1232 p,
1233 vInPos0,
1234 vInPos1,
1235 vOutIndex,
1236 pInVerts,
1237 numInAttribs,
1238 pOutVerts);
1239
1240 // increment outIndex for active lanes
1241 vOutIndex = SIMD_T::blendv_epi32(
1242 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1243 }
1244
1245 // store p if inside
1246 p_in = SIMD_T::and_ps(p_in, vActiveMask);
1247 if (!SIMD_T::testz_ps(p_in, p_in))
1248 {
1249 for (uint32_t c = 0; c < 4; ++c)
1250 {
1251 ScatterComponent(
1252 pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1253 }
1254
1255 // interpolate attributes and store
1256 for (uint32_t a = 0; a < numInAttribs; ++a)
1257 {
1258 uint32_t attribSlot = vertexAttribOffset + a;
1259 for (uint32_t c = 0; c < 4; ++c)
1260 {
1261 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1262 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1263 }
1264 }
1265
1266 // increment outIndex
1267 vOutIndex = SIMD_T::blendv_epi32(
1268 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1269 }
1270 }
1271
1272 return vOutIndex;
1273 }
1274
1275 Integer<SIMD_T> ClipPrims(float* pVertices,
1276 const Float<SIMD_T>& vPrimMask,
1277 const Float<SIMD_T>& vClipMask,
1278 int numAttribs)
1279 {
1280 // temp storage
1281 float* pTempVerts = reinterpret_cast<float*>(ClipHelper<SIMD_T>::GetTempVertices());
1282
1283 // zero out num input verts for non-active lanes
1284 Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1285 vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1286
1287 // clip prims to frustum
1288 Integer<SIMD_T> vNumOutPts;
1289 if (NumVertsPerPrim == 3)
1290 {
1291 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1292 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1293 vNumOutPts =
1294 ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1295 vNumOutPts =
1296 ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1297 vNumOutPts =
1298 ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1299 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1300 }
1301 else
1302 {
1303 SWR_ASSERT(NumVertsPerPrim == 2);
1304 vNumOutPts =
1305 ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1306 vNumOutPts =
1307 ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1308 vNumOutPts =
1309 ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1310 vNumOutPts =
1311 ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1312 vNumOutPts =
1313 ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1314 vNumOutPts =
1315 ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1316 }
1317
1318 // restore num verts for non-clipped, active lanes
1319 Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1320 vNumOutPts =
1321 SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1322
1323 return vNumOutPts;
1324 }
1325
1326 const uint32_t workerId{0};
1327 DRAW_CONTEXT* pDC{nullptr};
1328 const API_STATE& state;
1329 Float<SIMD_T> clipCodes[NumVertsPerPrim];
1330 };
1331
1332 // pipeline stage functions
1333 void ClipRectangles(DRAW_CONTEXT* pDC,
1334 PA_STATE& pa,
1335 uint32_t workerId,
1336 simdvector prims[],
1337 uint32_t primMask,
1338 simdscalari const& primId,
1339 simdscalari const& viewportIdx,
1340 simdscalari const& rtIdx);
1341 void ClipTriangles(DRAW_CONTEXT* pDC,
1342 PA_STATE& pa,
1343 uint32_t workerId,
1344 simdvector prims[],
1345 uint32_t primMask,
1346 simdscalari const& primId,
1347 simdscalari const& viewportIdx,
1348 simdscalari const& rtIdx);
1349 void ClipLines(DRAW_CONTEXT* pDC,
1350 PA_STATE& pa,
1351 uint32_t workerId,
1352 simdvector prims[],
1353 uint32_t primMask,
1354 simdscalari const& primId,
1355 simdscalari const& viewportIdx,
1356 simdscalari const& rtIdx);
1357 void ClipPoints(DRAW_CONTEXT* pDC,
1358 PA_STATE& pa,
1359 uint32_t workerId,
1360 simdvector prims[],
1361 uint32_t primMask,
1362 simdscalari const& primId,
1363 simdscalari const& viewportIdx,
1364 simdscalari const& rtIdx);
#if USE_SIMD16_FRONTEND
// SIMD16 variants of the clip-stage entry points, compiled only when
// USE_SIMD16_FRONTEND is enabled. The parameter list has the same names as
// the SIMD8 declarations but uses the simd16 vector and integer types.
void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
                                    PA_STATE&            pa,
                                    uint32_t             workerId,
                                    simd16vector         prims[],
                                    uint32_t             primMask,
                                    const simd16scalari& primId,
                                    const simd16scalari& viewportIdx,
                                    const simd16scalari& rtIdx);

void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
                                   PA_STATE&            pa,
                                   uint32_t             workerId,
                                   simd16vector         prims[],
                                   uint32_t             primMask,
                                   const simd16scalari& primId,
                                   const simd16scalari& viewportIdx,
                                   const simd16scalari& rtIdx);

void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
                               PA_STATE&            pa,
                               uint32_t             workerId,
                               simd16vector         prims[],
                               uint32_t             primMask,
                               const simd16scalari& primId,
                               const simd16scalari& viewportIdx,
                               const simd16scalari& rtIdx);

void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
                                PA_STATE&            pa,
                                uint32_t             workerId,
                                simd16vector         prims[],
                                uint32_t             primMask,
                                const simd16scalari& primId,
                                const simd16scalari& viewportIdx,
                                const simd16scalari& rtIdx);
#endif