swr: [rasterizer] Fix Coverity issues reported by Mesa developers.
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
enum SWR_CLIPCODES
{
    // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
    // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
    // One bit per frustum plane; the vertex is outside the plane when set.
    FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),

    // Vertex has w <= 0 (behind the eye); such prims always require clipping.
    NEGW            = (0x40 << CLIPCODE_SHIFT),

    // Guardband planes share a single high bit and use a distinct LSB each;
    // this only works because guardband codes are combined by union (see note above).
    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// All six view-frustum planes: used for trivial-reject testing.
#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR)
// Planes that require actual clipping when guardband culling is enabled.
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)

// Scalar clipper entry point: clips a single triangle against the frustum,
// writing up to 7 output vertices (defined elsewhere, presumably clip.cpp).
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles,
          int *numVerts, float *pOutAttribs);
61
//////////////////////////////////////////////////////////////////////////
/// @brief Compute per-lane clip codes for one SIMD vertex against the
///        frustum, negative-w, and guardband planes.
/// Each plane test produces an all-ones float mask for lanes that fail;
/// ANDing that mask with the plane's integer clip code (bit-cast to float)
/// deposits the code bits only into failing lanes. Codes are accumulated
/// with OR into clipCodes.
/// @param type      - driver type; DX clips z in [0..w], GL in [-w..w].
/// @param state     - API state; supplies depth-clip enable and guardband extents.
/// @param vertex    - SIMD vertex positions (clip space).
/// @param clipCodes - output: per-lane union of clip code bits.
INLINE
void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes)
{
    clipCodes = _simd_setzero_ps();

    // -w
    simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));

    // FRUSTUM_LEFT: x < -w
    simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
    clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));

    // FRUSTUM_TOP: y < -w
    vRes = _simd_cmplt_ps(vertex.y, vNegW);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));

    // FRUSTUM_RIGHT: x > w
    vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));

    // FRUSTUM_BOTTOM: y > w
    vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));

    if (state.rastState.depthClipEnable)
    {
        // FRUSTUM_NEAR
        // DX clips depth [0..w], GL clips [-w..w]
        if (type == DX)
        {
            vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
        }
        else
        {
            vRes = _simd_cmplt_ps(vertex.z, vNegW);
        }
        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));

        // FRUSTUM_FAR: z > w
        vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
    }

    // NEGW: w <= 0
    vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));

    // GUARDBAND_LEFT: x < gbLeft * -w
    simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left));
    vRes = _simd_cmplt_ps(vertex.x, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));

    // GUARDBAND_TOP: y < gbTop * -w
    gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top));
    vRes = _simd_cmplt_ps(vertex.y, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));

    // GUARDBAND_RIGHT: x > gbRight * w
    gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right));
    vRes = _simd_cmpgt_ps(vertex.x, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));

    // GUARDBAND_BOTTOM: y > gbBottom * w
    gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom));
    vRes = _simd_cmpgt_ps(vertex.y, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
}
129
130 template<uint32_t NumVertsPerPrim>
131 class Clipper
132 {
133 public:
    // Bind the clipper to a worker thread and a draw context; caches the
    // driver type and a reference to the current API state.
    Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
        workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC))
    {
        // Clipper handles points (1), lines (2), and triangles (3) only.
        static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
    }
139
140 void ComputeClipCodes(simdvector vertex[])
141 {
142 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
143 {
144 ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]);
145 }
146 }
147
148 simdscalar ComputeClipCodeIntersection()
149 {
150 simdscalar result = this->clipCodes[0];
151 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
152 {
153 result = _simd_and_ps(result, this->clipCodes[i]);
154 }
155 return result;
156 }
157
158 simdscalar ComputeClipCodeUnion()
159 {
160 simdscalar result = this->clipCodes[0];
161 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
162 {
163 result = _simd_or_ps(result, this->clipCodes[i]);
164 }
165 return result;
166 }
167
168 int ComputeNegWMask()
169 {
170 simdscalar clipCodeUnion = ComputeClipCodeUnion();
171 clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
172 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
173 }
174
175 int ComputeClipMask()
176 {
177 simdscalar clipUnion = ComputeClipCodeUnion();
178 clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
179 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
180 }
181
182 // clipper is responsible for culling any prims with NAN coordinates
183 int ComputeNaNMask(simdvector prim[])
184 {
185 simdscalar vNanMask = _simd_setzero_ps();
186 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
187 {
188 simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
189 vNanMask = _simd_or_ps(vNanMask, vNan01);
190 simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
191 vNanMask = _simd_or_ps(vNanMask, vNan23);
192 }
193
194 return _simd_movemask_ps(vNanMask);
195 }
196
    // Returns a bitmask of primitives rejected by user cull distances, or
    // containing a NaN user clip distance. Cull/clip distances live in two
    // vertex slots of 4 components each: LO holds distances 0-3, HI holds 4-7.
    int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
    {
        uint8_t cullMask = this->state.rastState.cullDistanceMask;
        simdscalar vClipCullMask = _simd_setzero_ps();
        DWORD index;

        simdvector vClipCullDistLo[3];
        simdvector vClipCullDistHi[3];

        pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
        pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
        // Iterate over each enabled cull distance (bit index = distance index).
        while (_BitScanForward(&index, cullMask))
        {
            cullMask &= ~(1 << index);
            // distance index -> slot (0 = LO, 1 = HI) and component within the slot
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            // Start all-ones; a prim is culled only if EVERY vertex fails.
            simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simdscalar vCullComp;
                if (slot == 0)
                {
                    vCullComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vCullComp = vClipCullDistHi[e][component];
                }

                // cull if cull distance < 0 || NAN
                // (_CMP_NLE_UQ: "0 not <= dist", true for dist < 0 and unordered/NaN)
                simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
                vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
            }
            vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
        }

        // clipper should also discard any primitive with NAN clip distance
        uint8_t clipMask = this->state.rastState.clipDistanceMask;
        while (_BitScanForward(&index, clipMask))
        {
            clipMask &= ~(1 << index);
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simdscalar vClipComp;
                if (slot == 0)
                {
                    vClipComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vClipComp = vClipCullDistHi[e][component];
                }

                // NaN check: unordered self-compare is true only for NaN
                simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
                vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
            }
        }

        return _simd_movemask_ps(vClipCullMask);
    }
261
    // clip a single primitive
    // Extracts one triangle (position + linked attributes) from the PA in AOS
    // form and runs it through the scalar Clip() routine. Returns the number
    // of output vertices produced.
    int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
    {
        OSALIGN(float, 16) inVerts[3 * 4];
        OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];

        // transpose primitive position
        __m128 verts[3];
        pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts);
        _mm_store_ps(&inVerts[0], verts[0]);
        _mm_store_ps(&inVerts[4], verts[1]);
        _mm_store_ps(&inVerts[8], verts[2]);

        // transpose attribs
        uint32_t numScalarAttribs = this->state.linkageCount * 4;

        // Walk the linkage mask bit-by-bit; each set bit is one linked attribute.
        int idx = 0;
        DWORD slot = 0;
        uint32_t mapIdx = 0;
        uint32_t tmpLinkage = uint32_t(this->state.linkageMask);
        while (_BitScanForward(&slot, tmpLinkage))
        {
            tmpLinkage &= ~(1 << slot);
            // Compute absolute attrib slot in vertex array
            uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++];
            __m128 attrib[3]; // triangle attribs (always 4 wide)
            pa.AssembleSingle(inputSlot, primIndex, attrib);
            // Layout: all attribs of vertex 0, then vertex 1, then vertex 2.
            _mm_store_ps(&inAttribs[idx], attrib[0]);
            _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]);
            _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]);
            idx += 4;
        }

        int numVerts;
        Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs);

        return numVerts;
    }
300
    // clip SIMD primitives
    // Gathers the primitives' positions/attributes into SOA vertex storage,
    // clips all SIMD lanes at once via ClipPrims, then transposes each lane's
    // clipped vertex fan back into SIMD order and feeds it to the binner.
    // @param vPrimMask - lane mask of valid input primitives
    // @param vClipMask - lane mask of primitives that need clipping
    // @param pa        - primitive assembler for the input prims
    // @param vPrimId   - per-lane primitive ids (forwarded to the binner)
    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
    {
        // input/output vertex store for clipper
        simdvertex vertices[7]; // maximum 7 verts generated per triangle

        LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
        uint32_t provokingVertex = 0;
        if(pa.binTopology == TOP_TRIANGLE_FAN)
        {
            provokingVertex = this->state.frontendState.provokingVertex.triFan;
        }
        ///@todo: line topology for wireframe?

        // assemble pos
        simdvector tmpVector[NumVertsPerPrim];
        pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
        }

        // assemble attribs
        DWORD slot = 0;
        uint32_t mapIdx = 0;
        uint32_t tmpLinkage = this->state.linkageMask;

        // Track the highest linked attribute slot to size the clip loop below.
        int32_t maxSlot = -1;
        while (_BitScanForward(&slot, tmpLinkage))
        {
            tmpLinkage &= ~(1 << slot);
            // Compute absolute attrib slot in vertex array
            uint32_t mapSlot = this->state.linkageMap[mapIdx++];
            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
            uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;

            pa.Assemble(inputSlot, tmpVector);

            // if constant interpolation enabled for this attribute, assign the provoking
            // vertex values to all edges
            if (_bittest(&constantInterpMask, slot))
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                }
            }
            else
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[i];
                }
            }
        }

        uint32_t numAttribs = maxSlot + 1;

        // Clip all lanes; per-lane output vertex counts come back in SIMD form.
        simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

        // set up new PA for binning clipped primitives
        PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
        if (NumVertsPerPrim == 3)
        {
            pfnBinFunc = BinTriangles;
            clipTopology = TOP_TRIANGLE_FAN;

            // so that the binner knows to bloat wide points later
            if (pa.binTopology == TOP_POINT_LIST)
                clipTopology = TOP_POINT_LIST;
        }
        else if (NumVertsPerPrim == 2)
        {
            pfnBinFunc = BinLines;
            clipTopology = TOP_LINE_LIST;
        }
        else
        {
            SWR_ASSERT(0 && "Unexpected points in clipper.");
        }


        uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
        uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;

        // Gather offsets to pull one float per output vertex (7 max) out of
        // the SOA vertex store; the 8th lane is unused.
        const simdscalari vOffsets = _mm256_set_epi32(
            0 * sizeof(simdvertex), // unused lane
            6 * sizeof(simdvertex),
            5 * sizeof(simdvertex),
            4 * sizeof(simdvertex),
            3 * sizeof(simdvertex),
            2 * sizeof(simdvertex),
            1 * sizeof(simdvertex),
            0 * sizeof(simdvertex));

        // only need to gather 7 verts
        // @todo dynamic mask based on actual # of verts generated per lane
        const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);

        uint32_t numClippedPrims = 0;
        for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts < NumVertsPerPrim)
            {
                // lane was fully clipped away
                continue;
            }
            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
            numClippedPrims += numEmittedPrims;

            // tranpose clipper output so that each lane's vertices are in SIMD order
            // set aside space for 2 vertices, as the PA will try to read up to 16 verts
            // for triangle fan
            simdvertex transposedPrims[2];

            // transpose pos
            uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
            for (uint32_t c = 0; c < 4; ++c)
            {
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
                pBase += sizeof(simdscalar);
            }

            // transpose attribs
            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
            {
                uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
                    pBase += sizeof(simdscalar);
                }
            }

            // Re-assemble the clipped fan through a local PA and hand each
            // full-SIMD batch to the binner.
            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);

            while (clipPa.GetNextStreamOutput())
            {
                do
                {
                    simdvector attrib[NumVertsPerPrim];
                    bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
                    if (assemble)
                    {
                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                    }
                } while (clipPa.NextPrim());
            }
        }

        // update global pipeline stat
        SWR_CONTEXT* pContext = this->pDC->pContext;
        UPDATE_STAT(CPrimitives, numClippedPrims);
    }
460
461 // execute the clipper stage
462 void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
463 {
464 // set up binner based on PA state
465 PFN_PROCESS_PRIMS pfnBinner;
466 switch (pa.binTopology)
467 {
468 case TOP_POINT_LIST:
469 pfnBinner = BinPoints;
470 break;
471 case TOP_LINE_LIST:
472 case TOP_LINE_STRIP:
473 case TOP_LINE_LOOP:
474 case TOP_LINE_LIST_ADJ:
475 case TOP_LISTSTRIP_ADJ:
476 pfnBinner = BinLines;
477 break;
478 default:
479 pfnBinner = BinTriangles;
480 break;
481 };
482
483 // update clipper invocations pipeline stat
484 SWR_CONTEXT* pContext = this->pDC->pContext;
485 uint32_t numInvoc = _mm_popcnt_u32(primMask);
486 UPDATE_STAT(CInvocations, numInvoc);
487
488 ComputeClipCodes(prim);
489
490 // cull prims with NAN coords
491 primMask &= ~ComputeNaNMask(prim);
492
493 // user cull distance cull
494 if (this->state.rastState.cullDistanceMask)
495 {
496 primMask &= ~ComputeUserClipCullMask(pa, prim);
497 }
498
499 // cull prims outside view frustum
500 simdscalar clipIntersection = ComputeClipCodeIntersection();
501 int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
502
503 // skip clipping for points
504 uint32_t clipMask = 0;
505 if (NumVertsPerPrim != 1)
506 {
507 clipMask = primMask & ComputeClipMask();
508 }
509
510 if (clipMask)
511 {
512 RDTSC_START(FEGuardbandClip);
513 // we have to clip tris, execute the clipper, which will also
514 // call the binner
515 ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
516 RDTSC_STOP(FEGuardbandClip, 1, 0);
517 }
518 else if (validMask)
519 {
520 // update CPrimitives pipeline state
521 SWR_CONTEXT* pContext = this->pDC->pContext;
522 UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
523
524 // forward valid prims directly to binner
525 pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
526 }
527 }
528
529 private:
530 inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
531 {
532 return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
533 }
534
535 inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
536 {
537 const uint32_t simdVertexStride = sizeof(simdvertex);
538 const uint32_t componentStride = sizeof(simdscalar);
539 const uint32_t attribStride = sizeof(simdvector);
540 const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
541 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
542
543 // step to the simdvertex
544 simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
545
546 // step to the attribute and component
547 vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
548
549 // step to the lane
550 vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
551
552 return vOffsets;
553 }
554
555 // gathers a single component for a given attribute for each SIMD lane
556 inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
557 {
558 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
559 simdscalar vSrc = _mm256_undefined_ps();
560 return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
561 }
562
563 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
564 {
565 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
566
567 uint32_t* pOffsets = (uint32_t*)&vOffsets;
568 float* pSrc = (float*)&vSrc;
569 uint32_t mask = _simd_movemask_ps(vMask);
570 DWORD lane;
571 while (_BitScanForward(&lane, mask))
572 {
573 mask &= ~(1 << lane);
574 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
575 *(float*)pBuf = pSrc[lane];
576 }
577 }
578
579 template<SWR_CLIPCODES ClippingPlane>
580 inline void intersect(
581 const simdscalar& vActiveMask, // active lanes to operate on
582 const simdscalari& s, // index to first edge vertex v0 in pInPts.
583 const simdscalari& p, // index to second edge vertex v1 in pInPts.
584 const simdvector& v1, // vertex 0 position
585 const simdvector& v2, // vertex 1 position
586 simdscalari& outIndex, // output index.
587 const float *pInVerts, // array of all the input positions.
588 uint32_t numInAttribs, // number of attributes per vertex.
589 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
590 {
591 // compute interpolation factor
592 simdscalar t;
593 switch (ClippingPlane)
594 {
595 case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
596 case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
597 case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
598 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
599 case FRUSTUM_NEAR:
600 // DX Znear plane is 0, GL is -w
601 if (this->driverType == DX)
602 {
603 t = ComputeInterpFactor(v1[2], v2[2]);
604 }
605 else
606 {
607 t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
608 }
609 break;
610 case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
611 default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
612 };
613
614 // interpolate position and store
615 for (uint32_t c = 0; c < 4; ++c)
616 {
617 simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
618 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
619 }
620
621 // interpolate attributes and store
622 for (uint32_t a = 0; a < numInAttribs; ++a)
623 {
624 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
625 for (uint32_t c = 0; c < 4; ++c)
626 {
627 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
628 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
629 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
630 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
631 }
632 }
633 }
634
635 template<SWR_CLIPCODES ClippingPlane>
636 inline simdscalar inside(const simdvector& v)
637 {
638 switch (ClippingPlane)
639 {
640 case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
641 case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
642 case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
643 case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
644 case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
645 case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
646 default:
647 SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
648 return _simd_setzero_ps();
649 }
650 }
651
    // Sutherland-Hodgman style: clip the per-lane polygons (triangles or the
    // fans produced by earlier planes) against one plane. Walks each edge
    // (s, p); emits s when inside and the edge/plane intersection when the
    // edge crosses the plane. Returns the per-lane output vertex count.
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
        {
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
            // wrap p back to vertex 0 on the closing edge of each lane's polygon
            simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
            p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                // store position
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // increment loop index and update active mask
            vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
            vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
        }

        return vOutIndex;
    }
723
    // Line variant of the plane clipper: a line has exactly one edge (s, p),
    // so instead of looping, test both endpoints once and emit up to
    // s + intersection + p. Returns the per-lane output vertex count.
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        if (!_simd_testz_ps(vActiveMask, vActiveMask))
        {
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // interpolate attributes and store
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // store p if inside
            p_in = _simd_and_ps(p_in, vActiveMask);
            if (!_simd_testz_ps(p_in, p_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                }

                // interpolate attributes and store
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
            }
        }

        return vOutIndex;
    }
812
    //////////////////////////////////////////////////////////////////////////
    /// @brief Vertical clipper. Clips SIMD primitives at a time
    /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
    /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
    /// @param vClipMask - mask of primitives that actually require clipping
    /// @param numAttribs - number of valid input attribs, including position
    /// @return per-lane count of output vertices
    simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
    {
        // temp storage
        simdvertex tempVertices[7];
        float* pTempVerts = (float*)&tempVertices[0];

        // zero out num input verts for non-active lanes
        simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
        vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);

        // clip prims to frustum
        // Six plane passes ping-pong between pVertices and pTempVerts; the even
        // pass count means the final result lands back in pVertices.
        simdscalari vNumOutPts;
        if (NumVertsPerPrim == 3)
        {
            vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        }
        else
        {
            SWR_ASSERT(NumVertsPerPrim == 2);
            vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        }

        // restore num verts for non-clipped, active lanes
        simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
        vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);

        return vNumOutPts;
    }
856
857 const uint32_t workerId{ 0 };
858 const DRIVER_TYPE driverType{ DX };
859 DRAW_CONTEXT* pDC{ nullptr };
860 const API_STATE& state;
861 simdscalar clipCodes[NumVertsPerPrim];
862 };
863
864
865 // pipeline stage functions
866 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
867 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
868 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);