swr: [rasterizer core] SIMD16 Frontend WIP
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices[7];
37
// Per-vertex clip code bits. Frustum codes occupy the high bits (above the
// float mantissa); guardband codes share one high bit plus a unique LSB.
enum SWR_CLIPCODES
{
    // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
    // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
    FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),

    // w <= 0 (vertex behind or on the eye plane)
    NEGW            = (0x40 << CLIPCODE_SHIFT),

    // All guardband codes share bit (0x80 << CLIPCODE_SHIFT); the low 4 bits
    // distinguish the edges so a union of codes stays unambiguous.
    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// Codes that actually require clipping work (guardband edges, near/far, -w);
// plain frustum left/right/top/bottom outside-the-viewport-but-inside-the-
// guardband cases are handled by the rasterizer without clipping.
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
60
//////////////////////////////////////////////////////////////////////////
/// @brief Computes SWR_CLIPCODES for one SIMD of vertices.
/// Each lane of clipCodes accumulates (via OR) the clip-code bits for the
/// corresponding vertex, tested in homogeneous clip space against the
/// frustum planes and the per-viewport guardband.
/// @param state - API state; supplies depth-clip settings and guardband.
/// @param vertex - homogeneous positions (x, y, z, w), one vertex per lane.
/// @param clipCodes - output; accumulated clip-code bits per lane.
/// @param viewportIndexes - per-lane viewport index used to gather the
///        guardband extents for that lane's viewport.
INLINE
void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
{
    clipCodes = _simd_setzero_ps();

    // -w
    simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));

    // FRUSTUM_LEFT: x < -w
    simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
    // first test assigns directly (clipCodes is known zero here)
    clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));

    // FRUSTUM_TOP: y < -w
    vRes = _simd_cmplt_ps(vertex.y, vNegW);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));

    // FRUSTUM_RIGHT: x > w
    vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));

    // FRUSTUM_BOTTOM: y > w
    vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));

    if (state.rastState.depthClipEnable)
    {
        // FRUSTUM_NEAR
        // DX clips depth [0..w], GL clips [-w..w]
        if (state.rastState.clipHalfZ)
        {
            vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
        }
        else
        {
            vRes = _simd_cmplt_ps(vertex.z, vNegW);
        }
        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));

        // FRUSTUM_FAR: z > w
        vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
    }

    // NEGW: w <= 0 — cmple (not cmplt) so w == 0 is also flagged
    vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));

    // GUARDBAND_LEFT: x < -w * gbLeft[viewport]
    simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
    vRes = _simd_cmplt_ps(vertex.x, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));

    // GUARDBAND_TOP: y < -w * gbTop[viewport]
    gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
    vRes = _simd_cmplt_ps(vertex.y, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));

    // GUARDBAND_RIGHT: x > w * gbRight[viewport]
    gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
    vRes = _simd_cmpgt_ps(vertex.x, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));

    // GUARDBAND_BOTTOM: y > w * gbBottom[viewport]
    gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
    vRes = _simd_cmpgt_ps(vertex.y, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
}
128
129 template<uint32_t NumVertsPerPrim>
130 class Clipper
131 {
132 public:
    /// @brief Binds the clipper to a worker thread and draw context.
    /// @param in_workerId - id of the worker thread running this clipper.
    /// @param in_pDC - draw context; API state is cached by reference.
    Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
        workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
    {
        // only points (1), lines (2) and triangles (3) are supported
        static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
    }
138
139 void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
140 {
141 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
142 {
143 ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
144 }
145 }
146
147 simdscalar ComputeClipCodeIntersection()
148 {
149 simdscalar result = this->clipCodes[0];
150 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
151 {
152 result = _simd_and_ps(result, this->clipCodes[i]);
153 }
154 return result;
155 }
156
157 simdscalar ComputeClipCodeUnion()
158 {
159 simdscalar result = this->clipCodes[0];
160 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
161 {
162 result = _simd_or_ps(result, this->clipCodes[i]);
163 }
164 return result;
165 }
166
167 int ComputeNegWMask()
168 {
169 simdscalar clipCodeUnion = ComputeClipCodeUnion();
170 clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
171 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
172 }
173
174 int ComputeClipMask()
175 {
176 simdscalar clipUnion = ComputeClipCodeUnion();
177 clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
178 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
179 }
180
181 // clipper is responsible for culling any prims with NAN coordinates
182 int ComputeNaNMask(simdvector prim[])
183 {
184 simdscalar vNanMask = _simd_setzero_ps();
185 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
186 {
187 simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
188 vNanMask = _simd_or_ps(vNanMask, vNan01);
189 simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
190 vNanMask = _simd_or_ps(vNanMask, vNan23);
191 }
192
193 return _simd_movemask_ps(vNanMask);
194 }
195
    //////////////////////////////////////////////////////////////////////////
    /// @brief Returns a lane mask of primitives to discard based on user
    /// clip/cull distances.
    /// A primitive is culled when, for some enabled cull-distance component,
    /// EVERY vertex's distance is negative (or NaN) — note the AND across
    /// vertices below. Independently, a primitive is discarded if ANY enabled
    /// clip-distance component is NaN on any vertex.
    /// @param pa - primitive assembler used to fetch the distance attributes.
    /// @param prim - unused here; distances come from the PA slots.
    int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
    {
        uint8_t cullMask = this->state.rastState.cullDistanceMask;
        simdscalar vClipCullMask = _simd_setzero_ps();
        DWORD index;

        simdvector vClipCullDistLo[3];
        simdvector vClipCullDistHi[3];

        pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
        pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
        // iterate over set bits of the cull-distance mask
        while (_BitScanForward(&index, cullMask))
        {
            cullMask &= ~(1 << index);
            // distances 0-3 live in the LO slot, 4-7 in the HI slot
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            // start all-ones; AND keeps a lane set only if every vertex culls
            simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simdscalar vCullComp;
                if (slot == 0)
                {
                    vCullComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vCullComp = vClipCullDistHi[e][component];
                }

                // cull if cull distance < 0 || NAN
                // (_CMP_NLE_UQ: !(0 <= dist), true also when dist is NaN)
                simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
                vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
            }
            vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
        }

        // clipper should also discard any primitive with NAN clip distance
        uint8_t clipMask = this->state.rastState.clipDistanceMask;
        while (_BitScanForward(&index, clipMask))
        {
            clipMask &= ~(1 << index);
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simdscalar vClipComp;
                if (slot == 0)
                {
                    vClipComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vClipComp = vClipCullDistHi[e][component];
                }

                // self-compare with _CMP_UNORD_Q is true only for NaN
                simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
                vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
            }
        }

        return _simd_movemask_ps(vClipCullMask);
    }
260
    //////////////////////////////////////////////////////////////////////////
    /// @brief Clips a SIMD of primitives and bins the results.
    /// Gathers positions/attributes from the PA into an SOA vertex store,
    /// runs the vertical clipper (ClipPrims), then per input lane transposes
    /// the clipped vertex fan back into SIMD layout and feeds it to the
    /// binner through a temporary PA.
    /// @param vPrimMask - lane mask of valid input primitives.
    /// @param vClipMask - lane mask of primitives that need clipping.
    /// @param pa - primitive assembler for the source primitives.
    /// @param vPrimId - per-lane primitive ids.
    /// @param vViewportIdx - per-lane viewport array indexes.
    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
    {
        // input/output vertex store for clipper
        simdvertex vertices[7]; // maximum 7 verts generated per triangle

        LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
        uint32_t provokingVertex = 0;
        if(pa.binTopology == TOP_TRIANGLE_FAN)
        {
            provokingVertex = this->state.frontendState.provokingVertex.triFan;
        }
        ///@todo: line topology for wireframe?

        // assemble pos
        simdvector tmpVector[NumVertsPerPrim];
        pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
        }

        // assemble attribs
        const SWR_BACKEND_STATE& backendState = this->state.backendState;

        // track the highest source slot touched so we know how many
        // attributes the clipper must interpolate
        int32_t maxSlot = -1;
        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
        {
            // Compute absolute attrib slot in vertex array
            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
            uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;

            pa.Assemble(inputSlot, tmpVector);

            // if constant interpolation enabled for this attribute, assign the provoking
            // vertex values to all edges
            if (_bittest(&constantInterpMask, slot))
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                }
            }
            else
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[i];
                }
            }
        }

        // assemble user clip distances if enabled
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
            }
        }

        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
            }
        }

        uint32_t numAttribs = maxSlot + 1;

        // per-lane count of vertices emitted by the clipper
        simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

        // set up new PA for binning clipped primitives
        PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
        if (NumVertsPerPrim == 3)
        {
            pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
            clipTopology = TOP_TRIANGLE_FAN;

            // so that the binner knows to bloat wide points later
            if (pa.binTopology == TOP_POINT_LIST)
                clipTopology = TOP_POINT_LIST;

        }
        else if (NumVertsPerPrim == 2)
        {
            pfnBinFunc = BinLines;
            clipTopology = TOP_LINE_LIST;
        }
        else
        {
            SWR_ASSERT(0 && "Unexpected points in clipper.");
        }

        // reinterpret SIMD registers as scalar arrays for per-lane access
        uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
        uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
        uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;

        // byte offsets of the 7 clipper output vertices within the SOA store
        const simdscalari vOffsets = _mm256_set_epi32(
            0 * sizeof(simdvertex), // unused lane
            6 * sizeof(simdvertex),
            5 * sizeof(simdvertex),
            4 * sizeof(simdvertex),
            3 * sizeof(simdvertex),
            2 * sizeof(simdvertex),
            1 * sizeof(simdvertex),
            0 * sizeof(simdvertex));

        // only need to gather 7 verts
        // @todo dynamic mask based on actual # of verts generated per lane
        const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);

        uint32_t numClippedPrims = 0;
#if USE_SIMD16_FRONTEND
        const uint32_t numPrims = pa.NumPrims();
        const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);

        // NOTE(review): since numPrims_lo = min(numPrims, KNOB_SIMD_WIDTH),
        // this effectively asserts numPrims <= KNOB_SIMD_WIDTH — the SIMD16
        // WIP path only handles the low half here.
        SWR_ASSERT(numPrims <= numPrims_lo);

        for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim)
#else
        for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
#endif
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts < NumVertsPerPrim)
            {
                // fully clipped away
                continue;
            }
            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
            numClippedPrims += numEmittedPrims;

            // transpose clipper output so that each lane's vertices are in SIMD order
            // set aside space for 2 vertices, as the PA will try to read up to 16 verts
            // for triangle fan
#if USE_SIMD16_FRONTEND
            simd16vertex transposedPrims[2];
#else
            simdvertex transposedPrims[2];
#endif

            // transpose pos
            uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;

#if USE_SIMD16_FRONTEND
            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug
            static const float *dummy = reinterpret_cast<const float *>(pBase);
#endif

            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                pBase += sizeof(simdscalar);
            }

            // transpose attribs
            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
            {
                uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
                for (uint32_t c = 0; c < 4; ++c)
                {
#if USE_SIMD16_FRONTEND
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                    transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                    pBase += sizeof(simdscalar);
                }
            }

            // transpose user clip distances if enabled
            if (this->state.rastState.clipDistanceMask & 0xf)
            {
                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
                for (uint32_t c = 0; c < 4; ++c)
                {
#if USE_SIMD16_FRONTEND
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                    pBase += sizeof(simdscalar);
                }
            }

            if (this->state.rastState.clipDistanceMask & 0xf0)
            {
                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
                for (uint32_t c = 0; c < 4; ++c)
                {
#if USE_SIMD16_FRONTEND
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                    pBase += sizeof(simdscalar);
                }
            }

            // bin the clipped fan through a temporary PA
            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);

            while (clipPa.GetNextStreamOutput())
            {
                do
                {
#if USE_SIMD16_FRONTEND
                    simd16vector attrib_simd16[NumVertsPerPrim];
                    bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);

                    if (assemble)
                    {
                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };

                        // binner consumes simd8; extract the low half
                        simdvector attrib[NumVertsPerPrim];
                        for (uint32_t i = 0; i < NumVertsPerPrim; i += 1)
                        {
                            for (uint32_t j = 0; j < 4; j += 1)
                            {
                                attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
                            }
                        }

                        clipPa.useAlternateOffset = false;
                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
                    }
#else
                    simdvector attrib[NumVertsPerPrim];
                    bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
                    if (assemble)
                    {
                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
                    }
#endif
                } while (clipPa.NextPrim());
            }
        }

        // update global pipeline stat
        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    }
518
    //////////////////////////////////////////////////////////////////////////
    /// @brief Clipper stage entry point.
    /// Culls NaN/user-culled/frustum-rejected prims, forwards trivially
    /// accepted prims straight to the binner, and runs the full clipper
    /// (ClipSimd) only when the guardband clip mask is non-zero.
    /// @param pa - primitive assembler for the incoming SIMD of prims.
    /// @param prim - assembled homogeneous positions, one simdvector per vertex.
    /// @param primMask - lane mask of valid primitives.
    /// @param primId - per-lane primitive ids.
    /// @param viewportIdx - per-lane viewport array indexes.
    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
    {
        SWR_ASSERT(pa.pDC != nullptr);
        // NOTE(review): pContext is not referenced directly below; presumably
        // the UPDATE_STAT_FE macros expand to use it — verify before removing.
        SWR_CONTEXT* pContext = pa.pDC->pContext;

        // set up binner based on PA state
        PFN_PROCESS_PRIMS pfnBinner;
        switch (pa.binTopology)
        {
        case TOP_POINT_LIST:
            pfnBinner = BinPoints;
            break;
        case TOP_LINE_LIST:
        case TOP_LINE_STRIP:
        case TOP_LINE_LOOP:
        case TOP_LINE_LIST_ADJ:
        case TOP_LISTSTRIP_ADJ:
            pfnBinner = BinLines;
            break;
        default:
            pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
            break;
        };

        // update clipper invocations pipeline stat
        uint32_t numInvoc = _mm_popcnt_u32(primMask);
        UPDATE_STAT_FE(CInvocations, numInvoc);

        ComputeClipCodes(prim, viewportIdx);

        // cull prims with NAN coords
        primMask &= ~ComputeNaNMask(prim);

        // user cull distance cull
        if (this->state.rastState.cullDistanceMask)
        {
            primMask &= ~ComputeUserClipCullMask(pa, prim);
        }

        // cull prims outside view frustum
        // (a lane survives only if no plane has ALL its verts outside)
        simdscalar clipIntersection = ComputeClipCodeIntersection();
        int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));

        // skip clipping for points
        uint32_t clipMask = 0;
        if (NumVertsPerPrim != 1)
        {
            clipMask = primMask & ComputeClipMask();
        }

        if (clipMask)
        {
            AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
            // we have to clip tris, execute the clipper, which will also
            // call the binner
            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
            AR_END(FEGuardbandClip, 1);
        }
        else if (validMask)
        {
            // update CPrimitives pipeline state
            UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));

            // forward valid prims directly to binner
            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
        }
    }
587
588 private:
589 inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
590 {
591 return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
592 }
593
594 inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
595 {
596 const uint32_t simdVertexStride = sizeof(simdvertex);
597 const uint32_t componentStride = sizeof(simdscalar);
598 const uint32_t attribStride = sizeof(simdvector);
599 const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
600 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
601
602 // step to the simdvertex
603 simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
604
605 // step to the attribute and component
606 vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
607
608 // step to the lane
609 vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
610
611 return vOffsets;
612 }
613
614 // gathers a single component for a given attribute for each SIMD lane
615 inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
616 {
617 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
618 simdscalar vSrc = _mm256_undefined_ps();
619 return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
620 }
621
622 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
623 {
624 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
625
626 uint32_t* pOffsets = (uint32_t*)&vOffsets;
627 float* pSrc = (float*)&vSrc;
628 uint32_t mask = _simd_movemask_ps(vMask);
629 DWORD lane;
630 while (_BitScanForward(&lane, mask))
631 {
632 mask &= ~(1 << lane);
633 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
634 *(float*)pBuf = pSrc[lane];
635 }
636 }
637
    //////////////////////////////////////////////////////////////////////////
    /// @brief Computes the intersection of edge (s, p) with ClippingPlane for
    /// all active lanes, interpolating position, attributes, and enabled clip
    /// distances, and scatters the new vertex at outIndex.
    template<SWR_CLIPCODES ClippingPlane>
    inline void intersect(
        const simdscalar& vActiveMask,  // active lanes to operate on
        const simdscalari& s,           // index to first edge vertex v0 in pInPts.
        const simdscalari& p,           // index to second edge vertex v1 in pInPts.
        const simdvector& v1,           // position of the edge start vertex (index s)
        const simdvector& v2,           // position of the edge end vertex (index p)
        simdscalari& outIndex,          // output index.
        const float *pInVerts,          // array of all the input positions.
        uint32_t numInAttribs,          // number of attributes per vertex.
        float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
    {
        // compute interpolation factor from signed plane distances at each end
        simdscalar t;
        switch (ClippingPlane)
        {
        case FRUSTUM_LEFT:      t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
        case FRUSTUM_RIGHT:     t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
        case FRUSTUM_TOP:       t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
        case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
        case FRUSTUM_NEAR:
            // DX Znear plane is 0, GL is -w
            if (this->state.rastState.clipHalfZ)
            {
                t = ComputeInterpFactor(v1[2], v2[2]);
            }
            else
            {
                t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
            }
            break;
        case FRUSTUM_FAR:       t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
        default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
        };

        // interpolate position and store: out = v1 + t * (v2 - v1)
        for (uint32_t c = 0; c < 4; ++c)
        {
            simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
            ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
        }

        // interpolate attributes and store
        for (uint32_t a = 0; a < numInAttribs; ++a)
        {
            uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }

        // interpolate clip distance if enabled (distances 0-3 in LO slot)
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }

        // distances 4-7 in HI slot
        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }
    }
718
    //////////////////////////////////////////////////////////////////////////
    /// @brief Per-lane "inside the plane" test in homogeneous clip space:
    /// left/top test c >= -w, right/bottom test c <= w, near tests z >= 0
    /// (half-z) or z >= -w, far tests z <= w.
    /// @return all-ones lane mask where the vertex is inside ClippingPlane.
    template<SWR_CLIPCODES ClippingPlane>
    inline simdscalar inside(const simdvector& v)
    {
        switch (ClippingPlane)
        {
        case FRUSTUM_LEFT:      return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
        case FRUSTUM_RIGHT:     return _simd_cmple_ps(v[0], v[3]);
        case FRUSTUM_TOP:       return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
        case FRUSTUM_BOTTOM:    return _simd_cmple_ps(v[1], v[3]);
        case FRUSTUM_NEAR:      return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
        case FRUSTUM_FAR:       return _simd_cmple_ps(v[2], v[3]);
        default:
            SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
            return _simd_setzero_ps();
        }
    }
735
    //////////////////////////////////////////////////////////////////////////
    /// @brief Vectorized Sutherland-Hodgman step: clips the polygon in each
    /// SIMD lane against one plane, walking the vertex ring edge by edge.
    /// @param pInVerts - SOA input vertex buffer (all lanes).
    /// @param vNumInPts - per-lane count of input polygon vertices.
    /// @param numInAttribs - number of generic attributes to carry through.
    /// @param pOutVerts - SOA output vertex buffer.
    /// @return per-lane count of output vertices.
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
        {
            // current edge is (s, p); p wraps to vertex 0 on the closing edge
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
            simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
            p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in): edge crosses the plane
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                // store position
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // store clip distance if enabled
                if (this->state.rastState.clipDistanceMask & 0xf)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                if (this->state.rastState.clipDistanceMask & 0xf0)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex (only for lanes that stored s)
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // increment loop index and update active mask
            vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
            vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
        }

        return vOutIndex;
    }
828
    //////////////////////////////////////////////////////////////////////////
    /// @brief Clips the single line segment in each SIMD lane against one
    /// plane. Unlike the triangle path there is only one edge (s, p), so no
    /// loop: emit s if inside, the intersection if the segment crosses the
    /// plane, then p if inside.
    /// @param pInVerts - SOA input vertex buffer (all lanes).
    /// @param vNumInPts - per-lane count of input vertices.
    /// @param numInAttribs - number of generic attributes to carry through.
    /// @param pOutVerts - SOA output vertex buffer.
    /// @return per-lane count of output vertices.
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        if (!_simd_testz_ps(vActiveMask, vActiveMask))
        {
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // interpolate attributes and store
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // store p if inside
            p_in = _simd_and_ps(p_in, vActiveMask);
            if (!_simd_testz_ps(p_in, p_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                }

                // interpolate attributes and store
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
            }
        }

        return vOutIndex;
    }
917
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Vertical clipper. Clips SIMD primitives at a time
920 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
921 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
922 /// @param numAttribs - number of valid input attribs, including position
923 simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
924 {
925 // temp storage
926 float* pTempVerts = (float*)&tlsTempVertices[0];
927
928 // zero out num input verts for non-active lanes
929 simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
930 vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
931
932 // clip prims to frustum
933 simdscalari vNumOutPts;
934 if (NumVertsPerPrim == 3)
935 {
936 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
937 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
938 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
939 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
940 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
941 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
942 }
943 else
944 {
945 SWR_ASSERT(NumVertsPerPrim == 2);
946 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
947 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
948 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
949 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
950 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
951 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
952 }
953
954 // restore num verts for non-clipped, active lanes
955 simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
956 vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
957
958 return vNumOutPts;
959 }
960
    // Id of the worker thread this clipper instance executes on.
    const uint32_t workerId{ 0 };
    // Draw context for the current draw; defaults to nullptr — NOTE(review):
    // presumably bound by the constructor/owning stage before use, confirm.
    DRAW_CONTEXT* pDC{ nullptr };
    // API state for this draw (reference member, so it must be bound at
    // construction and must outlive this clipper).
    const API_STATE& state;
    // Per-vertex clip codes for the primitive currently being processed.
    simdscalar clipCodes[NumVertsPerPrim];
965 };
966
967
968 // pipeline stage functions
969 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
970 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
971 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);