swr/rast: simdlib cleanup, clipper stack space fixes
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
// Clip codes are shifted above the float mantissa so that a code never forms a
// denormalized value when it is used in a float compare.
// The guardband codes share one high bit and are told apart by four low bits;
// this is possible because guardband codes are combined as a union rather than
// as an intersection.
#define CLIPCODE_SHIFT 23

enum SWR_CLIPCODES
{
    // view frustum side planes
    FRUSTUM_LEFT   = (1 << (CLIPCODE_SHIFT + 0)),
    FRUSTUM_TOP    = (1 << (CLIPCODE_SHIFT + 1)),
    FRUSTUM_RIGHT  = (1 << (CLIPCODE_SHIFT + 2)),
    FRUSTUM_BOTTOM = (1 << (CLIPCODE_SHIFT + 3)),

    // view frustum depth planes
    FRUSTUM_NEAR = (1 << (CLIPCODE_SHIFT + 4)),
    FRUSTUM_FAR  = (1 << (CLIPCODE_SHIFT + 5)),

    // w <= 0
    NEGW = (1 << (CLIPCODE_SHIFT + 6)),

    // guardband edges: common high bit plus one distinguishing low bit each
    GUARDBAND_LEFT   = ((1 << (CLIPCODE_SHIFT + 7)) | 0x1),
    GUARDBAND_TOP    = ((1 << (CLIPCODE_SHIFT + 7)) | 0x2),
    GUARDBAND_RIGHT  = ((1 << (CLIPCODE_SHIFT + 7)) | 0x4),
    GUARDBAND_BOTTOM = ((1 << (CLIPCODE_SHIFT + 7)) | 0x8)
};
56
// Union of the guardband edge codes, the near/far plane codes and NEGW.
#define GUARDBAND_CLIP_MASK                                                 \
    (GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | \
     FRUSTUM_NEAR | FRUSTUM_FAR | NEGW)

// Union of the six view-frustum plane codes.
#define FRUSTUM_CLIP_MASK \
    (FRUSTUM_LEFT | FRUSTUM_TOP | FRUSTUM_RIGHT | FRUSTUM_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR)
62
63 template <typename SIMD_T>
64 void ComputeClipCodes(const API_STATE& state,
65 const Vec4<SIMD_T>& vertex,
66 Float<SIMD_T>& clipCodes,
67 Integer<SIMD_T> const& viewportIndexes)
68 {
69 clipCodes = SIMD_T::setzero_ps();
70
71 // -w
72 Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
73
74 // FRUSTUM_LEFT
75 Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
76 clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
77
78 // FRUSTUM_TOP
79 vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
80 clipCodes = SIMD_T::or_ps(
81 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
82
83 // FRUSTUM_RIGHT
84 vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
85 clipCodes = SIMD_T::or_ps(
86 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
87
88 // FRUSTUM_BOTTOM
89 vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
90 clipCodes = SIMD_T::or_ps(
91 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
92
93 if (state.rastState.depthClipEnable)
94 {
95 // FRUSTUM_NEAR
96 // DX clips depth [0..w], GL clips [-w..w]
97 if (state.rastState.clipHalfZ)
98 {
99 vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
100 }
101 else
102 {
103 vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
104 }
105 clipCodes = SIMD_T::or_ps(
106 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
107
108 // FRUSTUM_FAR
109 vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
110 clipCodes = SIMD_T::or_ps(
111 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
112 }
113
114 // NEGW
115 vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
116 clipCodes =
117 SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
118
119 // GUARDBAND_LEFT
120 Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
121 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
122 &state.gbState.left[0], viewportIndexes));
123 vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
124 clipCodes = SIMD_T::or_ps(
125 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
126
127 // GUARDBAND_TOP
128 gbMult = SIMD_T::mul_ps(vNegW,
129 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
130 &state.gbState.top[0], viewportIndexes));
131 vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
132 clipCodes = SIMD_T::or_ps(
133 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
134
135 // GUARDBAND_RIGHT
136 gbMult = SIMD_T::mul_ps(vertex.w,
137 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
138 &state.gbState.right[0], viewportIndexes));
139 vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
140 clipCodes = SIMD_T::or_ps(
141 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
142
143 // GUARDBAND_BOTTOM
144 gbMult = SIMD_T::mul_ps(vertex.w,
145 SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
146 &state.gbState.bottom[0], viewportIndexes));
147 vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
148 clipCodes = SIMD_T::or_ps(
149 clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
150 }
151
/// Primary template: intentionally empty. The usable binner selectors are the
/// explicit specializations for the concrete SIMD widths.
template <typename SIMD_T>
struct BinnerChooser {};
156
157 template <>
158 struct BinnerChooser<SIMD256>
159 {
160 PFN_PROCESS_PRIMS pfnBinFunc;
161
162 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
163 :
164 pfnBinFunc(nullptr)
165 {
166 if (numVertsPerPrim == 3)
167 {
168 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
169
170 }
171 else if (numVertsPerPrim == 2)
172 {
173 pfnBinFunc = BinLines;
174 }
175 else
176 {
177 SWR_ASSERT(0 && "Unexpected points in clipper.");
178 }
179 }
180
181 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
182 :
183 pfnBinFunc(nullptr)
184 {
185 switch (topology)
186 {
187 case TOP_POINT_LIST:
188 pfnBinFunc = BinPoints;
189 break;
190 case TOP_LINE_LIST:
191 case TOP_LINE_STRIP:
192 case TOP_LINE_LOOP:
193 case TOP_LINE_LIST_ADJ:
194 case TOP_LISTSTRIP_ADJ:
195 pfnBinFunc = BinLines;
196 break;
197 default:
198 pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
199 break;
200 };
201 }
202
203 void BinFunc(DRAW_CONTEXT* pDC,
204 PA_STATE& pa,
205 uint32_t workerId,
206 SIMD256::Vec4 prims[],
207 uint32_t primMask,
208 SIMD256::Integer const& primID,
209 SIMD256::Integer& viewportIdx,
210 SIMD256::Integer& rtIdx)
211 {
212 SWR_ASSERT(pfnBinFunc != nullptr);
213
214 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
215 }
216 };
217
218 #if USE_SIMD16_FRONTEND
219 template <>
220 struct BinnerChooser<SIMD512>
221 {
222 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
223
224 BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
225 :
226 pfnBinFunc(nullptr)
227 {
228 if (numVertsPerPrim == 3)
229 {
230 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
231
232 }
233 else if (numVertsPerPrim == 2)
234 {
235 pfnBinFunc = BinLines_simd16;
236 }
237 else
238 {
239 SWR_ASSERT(0 && "Unexpected points in clipper.");
240 }
241 }
242
243 BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
244 :
245 pfnBinFunc(nullptr)
246 {
247 switch (topology)
248 {
249 case TOP_POINT_LIST:
250 pfnBinFunc = BinPoints_simd16;
251 break;
252 case TOP_LINE_LIST:
253 case TOP_LINE_STRIP:
254 case TOP_LINE_LOOP:
255 case TOP_LINE_LIST_ADJ:
256 case TOP_LISTSTRIP_ADJ:
257 pfnBinFunc = BinLines_simd16;
258 break;
259 default:
260 pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
261 break;
262 };
263 }
264
265 void BinFunc(DRAW_CONTEXT* pDC,
266 PA_STATE& pa,
267 uint32_t workerId,
268 SIMD512::Vec4 prims[],
269 uint32_t primMask,
270 SIMD512::Integer const& primID,
271 SIMD512::Integer& viewportIdx,
272 SIMD512::Integer& rtIdx)
273 {
274 SWR_ASSERT(pfnBinFunc != nullptr);
275
276 pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
277 }
278 };
279
280 #endif
/// Primary template: intentionally empty. The width-specific helpers are
/// provided by the explicit specializations.
template <typename SIMD_T>
struct SimdHelper {};
285
286 template <>
287 struct SimdHelper<SIMD256>
288 {
289 static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
290
291 static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
292 {
293 return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
294 }
295 };
296
297 #if USE_SIMD16_FRONTEND
298 template <>
299 struct SimdHelper<SIMD512>
300 {
301 static SIMD512::Float insert_lo_ps(SIMD256::Float a)
302 {
303 return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
304 }
305
306 static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
307 {
308 return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
309 }
310 };
311 #endif
312
313 template <typename SIMD_T, uint32_t NumVertsPerPrimT>
314 class Clipper
315 {
316 public:
317 INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
318 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
319 {
320 static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim");
321 THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId];
322
323 if (thread_data.clipperData == nullptr)
324 {
325 // 7 vertex temp data
326 // 7 post-clipped vertices
327 // 2 transposed verts for binning
328 size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2);
329 thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES);
330 }
331 SWR_ASSERT(thread_data.clipperData);
332
333 this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData;
334 this->tmpVerts = this->clippedVerts + 7;
335 this->transposedVerts = this->tmpVerts + 7;
336 }
337
338 void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
339 {
340 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
341 {
342 ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
343 }
344 }
345
346 Float<SIMD_T> ComputeClipCodeIntersection()
347 {
348 Float<SIMD_T> result = clipCodes[0];
349
350 for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
351 {
352 result = SIMD_T::and_ps(result, clipCodes[i]);
353 }
354
355 return result;
356 }
357
358 Float<SIMD_T> ComputeClipCodeUnion()
359 {
360 Float<SIMD_T> result = clipCodes[0];
361
362 for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
363 {
364 result = SIMD_T::or_ps(result, clipCodes[i]);
365 }
366
367 return result;
368 }
369
370 int ComputeClipMask()
371 {
372 Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
373
374 clipUnion =
375 SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
376
377 return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
378 }
379
380 // clipper is responsible for culling any prims with NAN coordinates
381 int ComputeNaNMask(Vec4<SIMD_T> prim[])
382 {
383 Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
384
385 for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
386 {
387 Float<SIMD_T> vNan01 =
388 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
389 vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
390
391 Float<SIMD_T> vNan23 =
392 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
393 vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
394 }
395
396 return SIMD_T::movemask_ps(vNanMask);
397 }
398
399 int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
400 {
401 uint8_t cullMask = state.backendState.cullDistanceMask;
402 uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
403
404 Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
405
406 Vec4<SIMD_T> vClipCullDistLo[3];
407 Vec4<SIMD_T> vClipCullDistHi[3];
408
409 pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
410 pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
411
412 DWORD index;
413 while (_BitScanForward(&index, cullMask))
414 {
415 cullMask &= ~(1 << index);
416 uint32_t slot = index >> 2;
417 uint32_t component = index & 0x3;
418
419 Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
420 for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
421 {
422 Float<SIMD_T> vCullComp;
423 if (slot == 0)
424 {
425 vCullComp = vClipCullDistLo[e][component];
426 }
427 else
428 {
429 vCullComp = vClipCullDistHi[e][component];
430 }
431
432 // cull if cull distance < 0 || NAN
433 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
434 SIMD_T::setzero_ps(), vCullComp);
435 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
436 }
437 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
438 }
439
440 // clipper should also discard any primitive with NAN clip distance
441 uint8_t clipMask = state.backendState.clipDistanceMask;
442 while (_BitScanForward(&index, clipMask))
443 {
444 clipMask &= ~(1 << index);
445 uint32_t slot = index >> 2;
446 uint32_t component = index & 0x3;
447
448 Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
449 for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
450 {
451 Float<SIMD_T> vClipComp;
452 if (slot == 0)
453 {
454 vClipComp = vClipCullDistLo[e][component];
455 }
456 else
457 {
458 vClipComp = vClipCullDistHi[e][component];
459 }
460
461 Float<SIMD_T> vClip =
462 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
463 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
464 SIMD_T::setzero_ps(), vClipComp);
465 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
466 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
467 }
468 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
469 }
470
471 return SIMD_T::movemask_ps(vClipCullMask);
472 }
473
474 void ClipSimd(const Vec4<SIMD_T> prim[],
475 const Float<SIMD_T>& vPrimMask,
476 const Float<SIMD_T>& vClipMask,
477 PA_STATE& pa,
478 const Integer<SIMD_T>& vPrimId,
479 const Integer<SIMD_T>& vViewportIdx,
480 const Integer<SIMD_T>& vRtIdx)
481 {
482 // input/output vertex store for clipper
483 SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts;
484
485 uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
486 uint32_t provokingVertex = 0;
487 if (pa.binTopology == TOP_TRIANGLE_FAN)
488 {
489 provokingVertex = state.frontendState.provokingVertex.triFan;
490 }
491 ///@todo: line topology for wireframe?
492
493 // assemble pos
494 Vec4<SIMD_T> tmpVector[NumVertsPerPrimT];
495 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
496 {
497 vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
498 }
499
500 // assemble attribs
501 const SWR_BACKEND_STATE& backendState = state.backendState;
502
503 int32_t maxSlot = -1;
504 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
505 {
506 // Compute absolute attrib slot in vertex array
507 uint32_t mapSlot =
508 backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
509 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
510 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
511
512 pa.Assemble(inputSlot, tmpVector);
513
514 // if constant interpolation enabled for this attribute, assign the provoking
515 // vertex values to all edges
516 if (CheckBit(constantInterpMask, slot))
517 {
518 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
519 {
520 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
521 }
522 }
523 else
524 {
525 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
526 {
527 vertices[i].attrib[inputSlot] = tmpVector[i];
528 }
529 }
530 }
531
532 // assemble user clip distances if enabled
533 uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
534 if (state.backendState.clipDistanceMask & 0xf)
535 {
536 pa.Assemble(vertexClipCullSlot, tmpVector);
537 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
538 {
539 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
540 }
541 }
542
543 if (state.backendState.clipDistanceMask & 0xf0)
544 {
545 pa.Assemble(vertexClipCullSlot + 1, tmpVector);
546 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
547 {
548 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
549 }
550 }
551
552 uint32_t numAttribs = maxSlot + 1;
553
554 Integer<SIMD_T> vNumClippedVerts =
555 ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
556
557 BinnerChooser<SIMD_T> binner(NumVertsPerPrimT,
558 pa.pDC->pState->state.rastState.conservativeRast);
559
560 // set up new PA for binning clipped primitives
561 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
562 if (NumVertsPerPrimT == 3)
563 {
564 clipTopology = TOP_TRIANGLE_FAN;
565
566 // so that the binner knows to bloat wide points later
567 if (pa.binTopology == TOP_POINT_LIST)
568 {
569 clipTopology = TOP_POINT_LIST;
570 }
571 else if (pa.binTopology == TOP_RECT_LIST)
572 {
573 clipTopology = TOP_RECT_LIST;
574 }
575 }
576 else if (NumVertsPerPrimT == 2)
577 {
578 clipTopology = TOP_LINE_LIST;
579 }
580 else
581 {
582 SWR_ASSERT(0 && "Unexpected points in clipper.");
583 }
584
585 const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
586 const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
587 const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
588 const uint32_t* pRtIdx = reinterpret_cast<const uint32_t*>(&vRtIdx);
589
590 const SIMD256::Integer vOffsets =
591 SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
592 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
593 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
594 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
595 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
596 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
597 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
598 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
599
600 // only need to gather 7 verts
601 // @todo dynamic mask based on actual # of verts generated per lane
602 const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
603
604 uint32_t numClippedPrims = 0;
605
606 // transpose clipper output so that each lane's vertices are in SIMD order
607 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
608 // for triangle fan
609 SIMDVERTEX_T<SIMD_T>* transposedPrims = this->transposedVerts;
610
611 uint32_t numInputPrims = pa.NumPrims();
612 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
613 {
614 uint32_t numEmittedVerts = pVertexCount[inputPrim];
615 if (numEmittedVerts < NumVertsPerPrimT)
616 {
617 continue;
618 }
619 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
620
621 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
622 SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
623
624 numClippedPrims += numEmittedPrims;
625
626 // tranpose clipper output so that each lane's vertices are in SIMD order
627 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
628 // for triangle fan
629
630 // transpose pos
631 float const* pBase =
632 reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
633 inputPrim;
634
635 for (uint32_t c = 0; c < 4; ++c)
636 {
637 SIMD256::Float temp =
638 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
639 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
640 SimdHelper<SIMD_T>::insert_lo_ps(temp);
641 pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
642 }
643
644 // transpose attribs
645 pBase = reinterpret_cast<float const*>(
646 &vertices[0].attrib[backendState.vertexAttribOffset]) +
647 inputPrim;
648
649 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
650 {
651 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
652
653 for (uint32_t c = 0; c < 4; ++c)
654 {
655 SIMD256::Float temp =
656 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
657 transposedPrims[0].attrib[attribSlot][c] =
658 SimdHelper<SIMD_T>::insert_lo_ps(temp);
659 pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
660 }
661 }
662
663 // transpose user clip distances if enabled
664 uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
665 if (state.backendState.clipDistanceMask & 0x0f)
666 {
667 pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) +
668 inputPrim;
669
670 for (uint32_t c = 0; c < 4; ++c)
671 {
672 SIMD256::Float temp =
673 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
674 transposedPrims[0].attrib[vertexClipCullSlot][c] =
675 SimdHelper<SIMD_T>::insert_lo_ps(temp);
676 pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
677 }
678 }
679
680 if (state.backendState.clipDistanceMask & 0xf0)
681 {
682 pBase =
683 reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
684 inputPrim;
685
686 for (uint32_t c = 0; c < 4; ++c)
687 {
688 SIMD256::Float temp =
689 SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
690 transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
691 SimdHelper<SIMD_T>::insert_lo_ps(temp);
692 pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
693 }
694 }
695
696 PA_STATE_OPT clipPA(pDC,
697 numEmittedPrims,
698 reinterpret_cast<uint8_t*>(&transposedPrims[0]),
699 numEmittedVerts,
700 SWR_VTX_NUM_SLOTS,
701 true,
702 NumVertsPerPrimT,
703 clipTopology);
704 clipPA.viewportArrayActive = pa.viewportArrayActive;
705 clipPA.rtArrayActive = pa.rtArrayActive;
706
707 static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
708
709 const uint32_t primMask = primMaskMap[numEmittedPrims];
710
711 const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
712 const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
713 const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
714
715 while (clipPA.GetNextStreamOutput())
716 {
717 do
718 {
719 Vec4<SIMD_T> attrib[NumVertsPerPrimT];
720
721 bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
722
723 if (assemble)
724 {
725 binner.pfnBinFunc(
726 pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
727 }
728
729 } while (clipPA.NextPrim());
730 }
731 }
732
733 // update global pipeline stat
734 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
735 }
736
737 void ExecuteStage(PA_STATE& pa,
738 Vec4<SIMD_T> prim[],
739 uint32_t primMask,
740 Integer<SIMD_T> const& primId,
741 Integer<SIMD_T> const& viewportIdx,
742 Integer<SIMD_T> const& rtIdx)
743 {
744 SWR_ASSERT(pa.pDC != nullptr);
745
746 BinnerChooser<SIMD_T> binner(pa.binTopology,
747 pa.pDC->pState->state.rastState.conservativeRast);
748
749 // update clipper invocations pipeline stat
750 uint32_t numInvoc = _mm_popcnt_u32(primMask);
751 UPDATE_STAT_FE(CInvocations, numInvoc);
752
753 ComputeClipCodes(prim, viewportIdx);
754
755 // cull prims with NAN coords
756 primMask &= ~ComputeNaNMask(prim);
757
758 // user cull distance cull
759 if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
760 {
761 primMask &= ~ComputeUserClipCullMask(pa, prim);
762 }
763
764 Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
765 // Mask out non-frustum codes
766 clipIntersection = SIMD_T::and_ps(clipIntersection,
767 SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
768
769 // cull prims outside view frustum
770 int validMask =
771 primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
772
773 // skip clipping for points
774 uint32_t clipMask = 0;
775 if (NumVertsPerPrimT != 1)
776 {
777 clipMask = validMask & ComputeClipMask();
778 }
779
780 AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
781
782 if (clipMask)
783 {
784 RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
785 // we have to clip tris, execute the clipper, which will also
786 // call the binner
787 ClipSimd(prim,
788 SIMD_T::vmask_ps(validMask),
789 SIMD_T::vmask_ps(clipMask),
790 pa,
791 primId,
792 viewportIdx,
793 rtIdx);
794 RDTSC_END(FEGuardbandClip, 1);
795 }
796 else if (validMask)
797 {
798 // update CPrimitives pipeline state
799 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
800
801 // forward valid prims directly to binner
802 binner.pfnBinFunc(
803 this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
804 }
805 }
806
807 private:
808 Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
809 Float<SIMD_T> const& boundaryCoord1)
810 {
811 return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
812 }
813
814 Integer<SIMD_T>
815 ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
816 {
817 const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
818 const uint32_t componentStride = sizeof(Float<SIMD_T>);
819 const uint32_t attribStride = sizeof(Vec4<SIMD_T>);
820
821 static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
822 0 * sizeof(float),
823 1 * sizeof(float),
824 2 * sizeof(float),
825 3 * sizeof(float),
826 4 * sizeof(float),
827 5 * sizeof(float),
828 6 * sizeof(float),
829 7 * sizeof(float),
830 8 * sizeof(float),
831 9 * sizeof(float),
832 10 * sizeof(float),
833 11 * sizeof(float),
834 12 * sizeof(float),
835 13 * sizeof(float),
836 14 * sizeof(float),
837 15 * sizeof(float),
838 };
839
840 static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
841 "Clipper::ComputeOffsets, Increase number of element offsets.");
842
843 Integer<SIMD_T> vElemOffset =
844 SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
845
846 // step to the simdvertex
847 Integer<SIMD_T> vOffsets =
848 SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
849
850 // step to the attribute and component
851 vOffsets = SIMD_T::add_epi32(
852 vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
853
854 // step to the lane
855 vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
856
857 return vOffsets;
858 }
859
860 Float<SIMD_T> GatherComponent(const float* pBuffer,
861 uint32_t attrib,
862 Float<SIMD_T> const& vMask,
863 Integer<SIMD_T> const& vIndices,
864 uint32_t component)
865 {
866 Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
867 Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
868
869 return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask);
870 }
871
872 void ScatterComponent(const float* pBuffer,
873 uint32_t attrib,
874 Float<SIMD_T> const& vMask,
875 Integer<SIMD_T> const& vIndices,
876 uint32_t component,
877 Float<SIMD_T> const& vSrc)
878 {
879 Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
880
881 const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
882 const float* pSrc = reinterpret_cast<const float*>(&vSrc);
883 uint32_t mask = SIMD_T::movemask_ps(vMask);
884 DWORD lane;
885 while (_BitScanForward(&lane, mask))
886 {
887 mask &= ~(1 << lane);
888 const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
889 *(float*)pBuf = pSrc[lane];
890 }
891 }
892
893 template <SWR_CLIPCODES ClippingPlane>
894 void intersect(const Float<SIMD_T>& vActiveMask, // active lanes to operate on
895 const Integer<SIMD_T>& s, // index to first edge vertex v0 in pInPts.
896 const Integer<SIMD_T>& p, // index to second edge vertex v1 in pInPts.
897 const Vec4<SIMD_T>& v1, // vertex 0 position
898 const Vec4<SIMD_T>& v2, // vertex 1 position
899 Integer<SIMD_T>& outIndex, // output index.
900 const float* pInVerts, // array of all the input positions.
901 uint32_t numInAttribs, // number of attributes per vertex.
902 float* pOutVerts) // array of output positions. We'll write our new intersection
903 // point at i*4.
904 {
905 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
906 uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
907
908 // compute interpolation factor
909 Float<SIMD_T> t;
910 switch (ClippingPlane)
911 {
912 case FRUSTUM_LEFT:
913 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
914 break;
915 case FRUSTUM_RIGHT:
916 t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
917 break;
918 case FRUSTUM_TOP:
919 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
920 break;
921 case FRUSTUM_BOTTOM:
922 t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
923 break;
924 case FRUSTUM_NEAR:
925 // DX Znear plane is 0, GL is -w
926 if (this->state.rastState.clipHalfZ)
927 {
928 t = ComputeInterpFactor(v1[2], v2[2]);
929 }
930 else
931 {
932 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
933 }
934 break;
935 case FRUSTUM_FAR:
936 t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
937 break;
938 default:
939 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
940 };
941
942 // interpolate position and store
943 for (uint32_t c = 0; c < 4; ++c)
944 {
945 Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
946 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
947 }
948
949 // interpolate attributes and store
950 for (uint32_t a = 0; a < numInAttribs; ++a)
951 {
952 uint32_t attribSlot = vertexAttribOffset + a;
953 for (uint32_t c = 0; c < 4; ++c)
954 {
955 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
956 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
957 Float<SIMD_T> vOutAttrib =
958 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
959 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
960 }
961 }
962
963 // interpolate clip distance if enabled
964 if (this->state.backendState.clipDistanceMask & 0xf)
965 {
966 uint32_t attribSlot = vertexClipCullOffset;
967 for (uint32_t c = 0; c < 4; ++c)
968 {
969 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
970 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
971 Float<SIMD_T> vOutAttrib =
972 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
973 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
974 }
975 }
976
977 if (this->state.backendState.clipDistanceMask & 0xf0)
978 {
979 uint32_t attribSlot = vertexClipCullOffset + 1;
980 for (uint32_t c = 0; c < 4; ++c)
981 {
982 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
983 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
984 Float<SIMD_T> vOutAttrib =
985 SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
986 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
987 }
988 }
989 }
990
991 template <SWR_CLIPCODES ClippingPlane>
992 Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
993 {
994 switch (ClippingPlane)
995 {
996 case FRUSTUM_LEFT:
997 return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
998 case FRUSTUM_RIGHT:
999 return SIMD_T::cmple_ps(v[0], v[3]);
1000 case FRUSTUM_TOP:
1001 return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1002 case FRUSTUM_BOTTOM:
1003 return SIMD_T::cmple_ps(v[1], v[3]);
1004 case FRUSTUM_NEAR:
1005 return SIMD_T::cmpge_ps(v[2],
1006 this->state.rastState.clipHalfZ
1007 ? SIMD_T::setzero_ps()
1008 : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1009 case FRUSTUM_FAR:
1010 return SIMD_T::cmple_ps(v[2], v[3]);
1011 default:
1012 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1013 return SIMD_T::setzero_ps();
1014 }
1015 }
1016
1017 template <SWR_CLIPCODES ClippingPlane>
1018 Integer<SIMD_T> ClipTriToPlane(const float* pInVerts,
1019 const Integer<SIMD_T>& vNumInPts,
1020 uint32_t numInAttribs,
1021 float* pOutVerts)
1022 {
1023 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1024
1025 Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
1026 Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
1027 Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1028
1029 while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1030 {
1031 Integer<SIMD_T> s = vCurIndex;
1032 Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1033 Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
1034 p = SIMD_T::castps_si(SIMD_T::blendv_ps(
1035 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
1036
1037 // gather position
1038 Vec4<SIMD_T> vInPos0, vInPos1;
1039 for (uint32_t c = 0; c < 4; ++c)
1040 {
1041 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1042 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1043 }
1044
1045 // compute inside mask
1046 Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1047 Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1048
1049 // compute intersection mask (s_in != p_in)
1050 Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1051 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1052
1053 // store s if inside
1054 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1055 if (!SIMD_T::testz_ps(s_in, s_in))
1056 {
1057 // store position
1058 for (uint32_t c = 0; c < 4; ++c)
1059 {
1060 ScatterComponent(
1061 pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1062 }
1063
1064 // store attribs
1065 for (uint32_t a = 0; a < numInAttribs; ++a)
1066 {
1067 uint32_t attribSlot = vertexAttribOffset + a;
1068 for (uint32_t c = 0; c < 4; ++c)
1069 {
1070 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1071 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1072 }
1073 }
1074
1075 // store clip distance if enabled
1076 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
1077 if (this->state.backendState.clipDistanceMask & 0xf)
1078 {
1079 uint32_t attribSlot = vertexClipCullSlot;
1080 for (uint32_t c = 0; c < 4; ++c)
1081 {
1082 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1083 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1084 }
1085 }
1086
1087 if (this->state.backendState.clipDistanceMask & 0xf0)
1088 {
1089 uint32_t attribSlot = vertexClipCullSlot + 1;
1090 for (uint32_t c = 0; c < 4; ++c)
1091 {
1092 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1093 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1094 }
1095 }
1096
1097 // increment outIndex
1098 vOutIndex = SIMD_T::blendv_epi32(
1099 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1100 }
1101
1102 // compute and store intersection
1103 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1104 {
1105 intersect<ClippingPlane>(intersectMask,
1106 s,
1107 p,
1108 vInPos0,
1109 vInPos1,
1110 vOutIndex,
1111 pInVerts,
1112 numInAttribs,
1113 pOutVerts);
1114
1115 // increment outIndex for active lanes
1116 vOutIndex = SIMD_T::blendv_epi32(
1117 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1118 }
1119
1120 // increment loop index and update active mask
1121 vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1122 vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1123 }
1124
1125 return vOutIndex;
1126 }
1127
1128 template <SWR_CLIPCODES ClippingPlane>
1129 Integer<SIMD_T> ClipLineToPlane(const float* pInVerts,
1130 const Integer<SIMD_T>& vNumInPts,
1131 uint32_t numInAttribs,
1132 float* pOutVerts)
1133 {
1134 uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1135
1136 Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
1137 Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
1138 Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1139
1140 if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1141 {
1142 Integer<SIMD_T> s = vCurIndex;
1143 Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1144
1145 // gather position
1146 Vec4<SIMD_T> vInPos0, vInPos1;
1147 for (uint32_t c = 0; c < 4; ++c)
1148 {
1149 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1150 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1151 }
1152
1153 // compute inside mask
1154 Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1155 Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1156
1157 // compute intersection mask (s_in != p_in)
1158 Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1159 intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1160
1161 // store s if inside
1162 s_in = SIMD_T::and_ps(s_in, vActiveMask);
1163 if (!SIMD_T::testz_ps(s_in, s_in))
1164 {
1165 for (uint32_t c = 0; c < 4; ++c)
1166 {
1167 ScatterComponent(
1168 pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1169 }
1170
1171 // interpolate attributes and store
1172 for (uint32_t a = 0; a < numInAttribs; ++a)
1173 {
1174 uint32_t attribSlot = vertexAttribOffset + a;
1175 for (uint32_t c = 0; c < 4; ++c)
1176 {
1177 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1178 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1179 }
1180 }
1181
1182 // increment outIndex
1183 vOutIndex = SIMD_T::blendv_epi32(
1184 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1185 }
1186
1187 // compute and store intersection
1188 if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1189 {
1190 intersect<ClippingPlane>(intersectMask,
1191 s,
1192 p,
1193 vInPos0,
1194 vInPos1,
1195 vOutIndex,
1196 pInVerts,
1197 numInAttribs,
1198 pOutVerts);
1199
1200 // increment outIndex for active lanes
1201 vOutIndex = SIMD_T::blendv_epi32(
1202 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1203 }
1204
1205 // store p if inside
1206 p_in = SIMD_T::and_ps(p_in, vActiveMask);
1207 if (!SIMD_T::testz_ps(p_in, p_in))
1208 {
1209 for (uint32_t c = 0; c < 4; ++c)
1210 {
1211 ScatterComponent(
1212 pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1213 }
1214
1215 // interpolate attributes and store
1216 for (uint32_t a = 0; a < numInAttribs; ++a)
1217 {
1218 uint32_t attribSlot = vertexAttribOffset + a;
1219 for (uint32_t c = 0; c < 4; ++c)
1220 {
1221 Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1222 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1223 }
1224 }
1225
1226 // increment outIndex
1227 vOutIndex = SIMD_T::blendv_epi32(
1228 vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1229 }
1230 }
1231
1232 return vOutIndex;
1233 }
1234
1235 Integer<SIMD_T> ClipPrims(float* pVertices,
1236 const Float<SIMD_T>& vPrimMask,
1237 const Float<SIMD_T>& vClipMask,
1238 int numAttribs)
1239 {
1240 // temp storage
1241 float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts);
1242
1243 // zero out num input verts for non-active lanes
1244 Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT);
1245 vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1246
1247 // clip prims to frustum
1248 Integer<SIMD_T> vNumOutPts;
1249 if (NumVertsPerPrimT == 3)
1250 {
1251 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1252 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1253 vNumOutPts =
1254 ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1255 vNumOutPts =
1256 ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1257 vNumOutPts =
1258 ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1259 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1260 }
1261 else
1262 {
1263 SWR_ASSERT(NumVertsPerPrimT == 2);
1264 vNumOutPts =
1265 ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1266 vNumOutPts =
1267 ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1268 vNumOutPts =
1269 ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1270 vNumOutPts =
1271 ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1272 vNumOutPts =
1273 ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1274 vNumOutPts =
1275 ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1276 }
1277
1278 // restore num verts for non-clipped, active lanes
1279 Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1280 vNumOutPts =
1281 SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask);
1282
1283 return vNumOutPts;
1284 }
1285
1286 const uint32_t workerId{0};
1287 DRAW_CONTEXT* pDC{nullptr};
1288 const API_STATE& state;
1289 Float<SIMD_T> clipCodes[NumVertsPerPrimT];
1290 SIMDVERTEX_T<SIMD_T>* clippedVerts;
1291 SIMDVERTEX_T<SIMD_T>* tmpVerts;
1292 SIMDVERTEX_T<SIMD_T>* transposedVerts;
1293 };
1294
1295 // pipeline stage functions
1296 void ClipRectangles(DRAW_CONTEXT* pDC,
1297 PA_STATE& pa,
1298 uint32_t workerId,
1299 simdvector prims[],
1300 uint32_t primMask,
1301 simdscalari const& primId,
1302 simdscalari const& viewportIdx,
1303 simdscalari const& rtIdx);
1304 void ClipTriangles(DRAW_CONTEXT* pDC,
1305 PA_STATE& pa,
1306 uint32_t workerId,
1307 simdvector prims[],
1308 uint32_t primMask,
1309 simdscalari const& primId,
1310 simdscalari const& viewportIdx,
1311 simdscalari const& rtIdx);
1312 void ClipLines(DRAW_CONTEXT* pDC,
1313 PA_STATE& pa,
1314 uint32_t workerId,
1315 simdvector prims[],
1316 uint32_t primMask,
1317 simdscalari const& primId,
1318 simdscalari const& viewportIdx,
1319 simdscalari const& rtIdx);
1320 void ClipPoints(DRAW_CONTEXT* pDC,
1321 PA_STATE& pa,
1322 uint32_t workerId,
1323 simdvector prims[],
1324 uint32_t primMask,
1325 simdscalari const& primId,
1326 simdscalari const& viewportIdx,
1327 simdscalari const& rtIdx);
1328 #if USE_SIMD16_FRONTEND
1329 void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC,
1330 PA_STATE& pa,
1331 uint32_t workerId,
1332 simd16vector prims[],
1333 uint32_t primMask,
1334 simd16scalari const& primId,
1335 simd16scalari const& viewportIdx,
1336 simd16scalari const& rtIdx);
1337 void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC,
1338 PA_STATE& pa,
1339 uint32_t workerId,
1340 simd16vector prims[],
1341 uint32_t primMask,
1342 simd16scalari const& primId,
1343 simd16scalari const& viewportIdx,
1344 simd16scalari const& rtIdx);
1345 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC,
1346 PA_STATE& pa,
1347 uint32_t workerId,
1348 simd16vector prims[],
1349 uint32_t primMask,
1350 simd16scalari const& primId,
1351 simd16scalari const& viewportIdx,
1352 simd16scalari const& rtIdx);
1353 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC,
1354 PA_STATE& pa,
1355 uint32_t workerId,
1356 simd16vector prims[],
1357 uint32_t primMask,
1358 simd16scalari const& primId,
1359 simd16scalari const& viewportIdx,
1360 simd16scalari const& rtIdx);
1361 #endif