bf16792a0a608e0136ef63aba9eb2a8d657d4194
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD simd16vertex tlsTempVertices_simd16[7];
39 #endif
40
enum SWR_CLIPCODES
{
    // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
    // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
    // View-frustum planes: one bit per plane, all placed above the float mantissa.
    FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),

    // Set when a vertex has w <= 0 (see ComputeClipCodes); such prims always need clipping.
    NEGW            = (0x40 << CLIPCODE_SHIFT),

    // Guardband planes share a single high bit plus a distinct LSB each (see note above).
    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// Codes that actually require clipping: near/far/neg-w plus the guardband edges
// (left/top/right/bottom frustum codes alone can be handled by the guardband).
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
63
INLINE
//////////////////////////////////////////////////////////////////////////
/// @brief Compute per-lane clip codes for one SIMD batch of homogeneous
///        clip-space vertices (frustum, guardband, and negative-w tests).
/// @param state - API state; supplies depth-clip settings and per-viewport guardband scales.
/// @param vertex - x/y/z/w clip-space coordinates, one vertex per SIMD lane.
/// @param clipCodes - output; OR of all SWR_CLIPCODES bits that apply per lane.
/// @param viewportIndexes - per-lane viewport index used to gather guardband state.
void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
{
    clipCodes = _simd_setzero_ps();

    // -w
    simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));

    // FRUSTUM_LEFT: x < -w
    simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
    clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));

    // FRUSTUM_TOP: y < -w
    vRes = _simd_cmplt_ps(vertex.y, vNegW);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));

    // FRUSTUM_RIGHT: x > w
    vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));

    // FRUSTUM_BOTTOM: y > w
    vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));

    if (state.rastState.depthClipEnable)
    {
        // FRUSTUM_NEAR
        // DX clips depth [0..w], GL clips [-w..w]
        if (state.rastState.clipHalfZ)
        {
            vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
        }
        else
        {
            vRes = _simd_cmplt_ps(vertex.z, vNegW);
        }
        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));

        // FRUSTUM_FAR: z > w
        vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
    }

    // NEGW: w <= 0
    vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));

    // GUARDBAND_LEFT: x < -w * gbLeft[viewport]
    simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
    vRes = _simd_cmplt_ps(vertex.x, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));

    // GUARDBAND_TOP: y < -w * gbTop[viewport]
    gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
    vRes = _simd_cmplt_ps(vertex.y, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));

    // GUARDBAND_RIGHT: x > w * gbRight[viewport]
    gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
    vRes = _simd_cmpgt_ps(vertex.x, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));

    // GUARDBAND_BOTTOM: y > w * gbBottom[viewport]
    gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
    vRes = _simd_cmpgt_ps(vertex.y, gbMult);
    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
}
131
#if USE_SIMD16_FRONTEND
INLINE
//////////////////////////////////////////////////////////////////////////
/// @brief simd16 variant of ComputeClipCodes; identical plane tests, but
///        operates on 16 vertices per call for the SIMD16 front end.
/// @param state - API state; supplies depth-clip settings and per-viewport guardband scales.
/// @param vertex - x/y/z/w clip-space coordinates, one vertex per SIMD lane.
/// @param clipCodes - output; OR of all SWR_CLIPCODES bits that apply per lane.
/// @param viewportIndexes - per-lane viewport index used to gather guardband state.
void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari viewportIndexes)
{
    clipCodes = _simd16_setzero_ps();

    // -w
    simd16scalar vNegW = _simd16_mul_ps(vertex.w, _simd16_set1_ps(-1.0f));

    // FRUSTUM_LEFT: x < -w
    simd16scalar vRes = _simd16_cmplt_ps(vertex.x, vNegW);
    clipCodes = _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_LEFT)));

    // FRUSTUM_TOP: y < -w
    vRes = _simd16_cmplt_ps(vertex.y, vNegW);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_TOP))));

    // FRUSTUM_RIGHT: x > w
    vRes = _simd16_cmpgt_ps(vertex.x, vertex.w);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_RIGHT))));

    // FRUSTUM_BOTTOM: y > w
    vRes = _simd16_cmpgt_ps(vertex.y, vertex.w);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_BOTTOM))));

    if (state.rastState.depthClipEnable)
    {
        // FRUSTUM_NEAR
        // DX clips depth [0..w], GL clips [-w..w]
        if (state.rastState.clipHalfZ)
        {
            vRes = _simd16_cmplt_ps(vertex.z, _simd16_setzero_ps());
        }
        else
        {
            vRes = _simd16_cmplt_ps(vertex.z, vNegW);
        }
        clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_NEAR))));

        // FRUSTUM_FAR: z > w
        vRes = _simd16_cmpgt_ps(vertex.z, vertex.w);
        clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_FAR))));
    }

    // NEGW: w <= 0
    vRes = _simd16_cmple_ps(vertex.w, _simd16_setzero_ps());
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(NEGW))));

    // GUARDBAND_LEFT: x < -w * gbLeft[viewport]
    simd16scalar gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
    vRes = _simd16_cmplt_ps(vertex.x, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_LEFT))));

    // GUARDBAND_TOP: y < -w * gbTop[viewport]
    gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
    vRes = _simd16_cmplt_ps(vertex.y, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_TOP))));

    // GUARDBAND_RIGHT: x > w * gbRight[viewport]
    gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
    vRes = _simd16_cmpgt_ps(vertex.x, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_RIGHT))));

    // GUARDBAND_BOTTOM: y > w * gbBottom[viewport]
    gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
    vRes = _simd16_cmpgt_ps(vertex.y, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_BOTTOM))));
}

#endif
202 template<uint32_t NumVertsPerPrim>
203 class Clipper
204 {
205 public:
    /// @brief Construct a clipper bound to a worker thread and draw context.
    /// @param in_workerId - id of the worker thread running this clipper.
    /// @param in_pDC - draw context; its API state snapshot is cached by reference.
    Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
        workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
    {
        // Only points (1), lines (2), and triangles (3) are supported.
        static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
    }
211
212 void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
213 {
214 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
215 {
216 ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
217 }
218 }
219
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant: compute and cache clip codes per prim vertex.
    void ComputeClipCodes(simd16vector vertex[], simd16scalari viewportIndexes)
    {
        for (uint32_t v = 0; v < NumVertsPerPrim; ++v)
        {
            ::ComputeClipCodes(state, vertex[v], clipCodes_simd16[v], viewportIndexes);
        }
    }

#endif
230 simdscalar ComputeClipCodeIntersection()
231 {
232 simdscalar result = this->clipCodes[0];
233 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
234 {
235 result = _simd_and_ps(result, this->clipCodes[i]);
236 }
237 return result;
238 }
239
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant: AND of all vertices' cached clip codes.
    simd16scalar ComputeClipCodeIntersection_simd16()
    {
        simd16scalar intersection = clipCodes_simd16[0];
        for (uint32_t v = 1; v < NumVertsPerPrim; ++v)
        {
            intersection = _simd16_and_ps(intersection, clipCodes_simd16[v]);
        }
        return intersection;
    }

#endif
252 simdscalar ComputeClipCodeUnion()
253 {
254 simdscalar result = this->clipCodes[0];
255 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
256 {
257 result = _simd_or_ps(result, this->clipCodes[i]);
258 }
259 return result;
260 }
261
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant: OR of all vertices' cached clip codes.
    simd16scalar ComputeClipCodeUnion_simd16()
    {
        simd16scalar unionCodes = clipCodes_simd16[0];
        for (uint32_t v = 1; v < NumVertsPerPrim; ++v)
        {
            unionCodes = _simd16_or_ps(unionCodes, clipCodes_simd16[v]);
        }
        return unionCodes;
    }

#endif
274 int ComputeNegWMask()
275 {
276 simdscalar clipCodeUnion = ComputeClipCodeUnion();
277 clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
278 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
279 }
280
281 int ComputeClipMask()
282 {
283 simdscalar clipUnion = ComputeClipCodeUnion();
284 clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
285 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
286 }
287
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant: lane mask of prims needing clipping.
    int ComputeClipMask_simd16()
    {
        simd16scalar needsClip = _simd16_and_ps(ComputeClipCodeUnion_simd16(),
                                                _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK)));
        return _simd16_movemask_ps(_simd16_cmpneq_ps(needsClip, _simd16_setzero_ps()));
    }

#endif
297 // clipper is responsible for culling any prims with NAN coordinates
298 int ComputeNaNMask(simdvector prim[])
299 {
300 simdscalar vNanMask = _simd_setzero_ps();
301 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
302 {
303 simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
304 vNanMask = _simd_or_ps(vNanMask, vNan01);
305 simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
306 vNanMask = _simd_or_ps(vNanMask, vNan23);
307 }
308
309 return _simd_movemask_ps(vNanMask);
310 }
311
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant: lane mask of prims with any NaN coordinate.
    int ComputeNaNMask(simd16vector prim[])
    {
        simd16scalar nanMask = _simd16_setzero_ps();
        for (uint32_t v = 0; v < NumVertsPerPrim; ++v)
        {
            // unordered compares return all-ones when either operand is NaN
            nanMask = _simd16_or_ps(nanMask, _simd16_cmp_ps(prim[v].v[0], prim[v].v[1], _CMP_UNORD_Q));
            nanMask = _simd16_or_ps(nanMask, _simd16_cmp_ps(prim[v].v[2], prim[v].v[3], _CMP_UNORD_Q));
        }

        return _simd16_movemask_ps(nanMask);
    }

#endif
    /// @brief Lane mask of prims to discard due to user cull/clip distances:
    ///        culled when any enabled cull distance is negative or NaN for ALL
    ///        vertices of the prim, or any enabled clip distance is NaN.
    /// @param pa - primitive assembler supplying the clip/cull distance attributes.
    /// @param prim - assembled prim positions (unused here; distances come from pa).
    int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
    {
        uint8_t cullMask = this->state.rastState.cullDistanceMask;
        simdscalar vClipCullMask = _simd_setzero_ps();
        DWORD index;

        simdvector vClipCullDistLo[3];
        simdvector vClipCullDistHi[3];

        pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
        pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
        // iterate over each enabled cull-distance bit
        while (_BitScanForward(&index, cullMask))
        {
            cullMask &= ~(1 << index);
            // distances 0-3 live in the LO slot, 4-7 in the HI slot
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            // start all-true; a vertex with a non-negative distance clears the lane
            simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simdscalar vCullComp;
                if (slot == 0)
                {
                    vCullComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vCullComp = vClipCullDistHi[e][component];
                }

                // cull if cull distance < 0 || NAN (NLE_UQ: !(0 <= dist), true on NaN)
                simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
                vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
            }
            vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
        }

        // clipper should also discard any primitive with NAN clip distance
        uint8_t clipMask = this->state.rastState.clipDistanceMask;
        while (_BitScanForward(&index, clipMask))
        {
            clipMask &= ~(1 << index);
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simdscalar vClipComp;
                if (slot == 0)
                {
                    vClipComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vClipComp = vClipCullDistHi[e][component];
                }

                // UNORD_Q is true only when the value is NaN
                simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
                vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
            }
        }

        return _simd_movemask_ps(vClipCullMask);
    }
392
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant of ComputeUserClipCullMask: discard prims with an
    ///        all-vertex negative/NaN cull distance or any NaN clip distance.
    /// @param pa - primitive assembler supplying the clip/cull distance attributes.
    /// @param prim - assembled prim positions (unused here; distances come from pa).
    int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
    {
        uint8_t cullMask = this->state.rastState.cullDistanceMask;
        simd16scalar vClipCullMask = _simd16_setzero_ps();

        simd16vector vClipCullDistLo[3];
        simd16vector vClipCullDistHi[3];

        pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
        pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);

        DWORD index;
        // iterate over each enabled cull-distance bit
        while (_BitScanForward(&index, cullMask))
        {
            cullMask &= ~(1 << index);
            // distances 0-3 live in the LO slot, 4-7 in the HI slot
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            // start all-true; a vertex with a non-negative distance clears the lane
            simd16scalar vCullMaskElem = _simd16_set1_ps(-1.0f);
            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simd16scalar vCullComp;
                if (slot == 0)
                {
                    vCullComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vCullComp = vClipCullDistHi[e][component];
                }

                // cull if cull distance < 0 || NAN (NLE_UQ: !(0 <= dist), true on NaN)
                simd16scalar vCull = _simd16_cmp_ps(_simd16_setzero_ps(), vCullComp, _CMP_NLE_UQ);
                vCullMaskElem = _simd16_and_ps(vCullMaskElem, vCull);
            }
            vClipCullMask = _simd16_or_ps(vClipCullMask, vCullMaskElem);
        }

        // clipper should also discard any primitive with NAN clip distance
        uint8_t clipMask = this->state.rastState.clipDistanceMask;
        while (_BitScanForward(&index, clipMask))
        {
            clipMask &= ~(1 << index);
            uint32_t slot = index >> 2;
            uint32_t component = index & 0x3;

            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
            {
                simd16scalar vClipComp;
                if (slot == 0)
                {
                    vClipComp = vClipCullDistLo[e][component];
                }
                else
                {
                    vClipComp = vClipCullDistHi[e][component];
                }

                // UNORD_Q is true only when the value is NaN
                simd16scalar vClip = _simd16_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
                vClipCullMask = _simd16_or_ps(vClipCullMask, vClip);
            }
        }

        return _simd16_movemask_ps(vClipCullMask);
    }

#endif
    // clip SIMD primitives
    /// @brief Clip a SIMD batch of primitives and re-bin the results.
    ///        Gathers positions/attributes into AoS vertex storage, runs ClipPrims,
    ///        then per input prim transposes the emitted vertices back to SoA and
    ///        feeds them through a local PA into the triangle/line binner.
    /// @param vPrimMask - lane mask of valid input prims.
    /// @param vClipMask - lane mask of prims that require clipping.
    /// @param pa - primitive assembler for the incoming prims.
    /// @param vPrimId - per-lane primitive ids, forwarded to the binner.
    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
    {
        // input/output vertex store for clipper
        simdvertex vertices[7]; // maximum 7 verts generated per triangle

        LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
        uint32_t provokingVertex = 0;
        if(pa.binTopology == TOP_TRIANGLE_FAN)
        {
            provokingVertex = this->state.frontendState.provokingVertex.triFan;
        }
        ///@todo: line topology for wireframe?

        // assemble pos
        simdvector tmpVector[NumVertsPerPrim];
        pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
        }

        // assemble attribs
        const SWR_BACKEND_STATE& backendState = this->state.backendState;

        int32_t maxSlot = -1;
        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
        {
            // Compute absolute attrib slot in vertex array
            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;

            pa.Assemble(inputSlot, tmpVector);

            // if constant interpolation enabled for this attribute, assign the provoking
            // vertex values to all edges
            if (_bittest(&constantInterpMask, slot))
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                }
            }
            else
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[i];
                }
            }
        }

        // assemble user clip distances if enabled (distances 0-3 -> LO slot)
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
            }
        }

        // distances 4-7 -> HI slot
        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
            }
        }

        uint32_t numAttribs = maxSlot + 1;

        // run the clipper; returns the per-lane count of emitted vertices
        simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

        // set up new PA for binning clipped primitives
        PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
        if (NumVertsPerPrim == 3)
        {
            pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
            clipTopology = TOP_TRIANGLE_FAN;

            // so that the binner knows to bloat wide points later
            if (pa.binTopology == TOP_POINT_LIST)
                clipTopology = TOP_POINT_LIST;

        }
        else if (NumVertsPerPrim == 2)
        {
            pfnBinFunc = BinLines;
            clipTopology = TOP_LINE_LIST;
        }
        else
        {
            SWR_ASSERT(0 && "Unexpected points in clipper.");
        }

        uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
        uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;

        // byte offsets of each emitted vertex within the AoS vertex store
        const simdscalari vOffsets = _mm256_set_epi32(
            0 * sizeof(simdvertex), // unused lane
            6 * sizeof(simdvertex),
            5 * sizeof(simdvertex),
            4 * sizeof(simdvertex),
            3 * sizeof(simdvertex),
            2 * sizeof(simdvertex),
            1 * sizeof(simdvertex),
            0 * sizeof(simdvertex));

        // only need to gather 7 verts
        // @todo dynamic mask based on actual # of verts generated per lane
        const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);

        uint32_t numClippedPrims = 0;
#if USE_SIMD16_FRONTEND
        const uint32_t numPrims = pa.NumPrims();
        const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);

        SWR_ASSERT(numPrims <= numPrims_lo);

        for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim)
#else
        for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
#endif
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            // fully-clipped prim: nothing to bin
            if (numEmittedVerts < NumVertsPerPrim)
            {
                continue;
            }
            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
            numClippedPrims += numEmittedPrims;

            // tranpose clipper output so that each lane's vertices are in SIMD order
            // set aside space for 2 vertices, as the PA will try to read up to 16 verts
            // for triangle fan
#if USE_SIMD16_FRONTEND
            simd16vertex transposedPrims[2];
#else
            simdvertex transposedPrims[2];
#endif

            // transpose pos
            uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;

#if USE_SIMD16_FRONTEND
            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
            static const float *dummy = reinterpret_cast<const float *>(pBase);
#endif

            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                pBase += sizeof(simdscalar);
            }

            // transpose attribs
            pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
            {
                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
                for (uint32_t c = 0; c < 4; ++c)
                {
#if USE_SIMD16_FRONTEND
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                    transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                    pBase += sizeof(simdscalar);
                }
            }

            // transpose user clip distances if enabled
            if (this->state.rastState.clipDistanceMask & 0xf)
            {
                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
                for (uint32_t c = 0; c < 4; ++c)
                {
#if USE_SIMD16_FRONTEND
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                    pBase += sizeof(simdscalar);
                }
            }

            if (this->state.rastState.clipDistanceMask & 0xf0)
            {
                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
                for (uint32_t c = 0; c < 4; ++c)
                {
#if USE_SIMD16_FRONTEND
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#else
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
#endif
                    pBase += sizeof(simdscalar);
                }
            }

            // local PA to re-assemble the clipped verts into prims for the binner
            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);

            while (clipPa.GetNextStreamOutput())
            {
                do
                {
#if USE_SIMD16_FRONTEND
                    simd16vector attrib_simd16[NumVertsPerPrim];
                    bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);

                    if (assemble)
                    {
                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };

                        // narrow simd16 output down to the low simd8 half for the simd8 binner
                        simdvector attrib[NumVertsPerPrim];
                        for (uint32_t i = 0; i < NumVertsPerPrim; i += 1)
                        {
                            for (uint32_t j = 0; j < 4; j += 1)
                            {
                                attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
                            }
                        }

                        clipPa.useAlternateOffset = false;
                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                    }
#else
                    simdvector attrib[NumVertsPerPrim];
                    bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
                    if (assemble)
                    {
                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                    }
#endif
                } while (clipPa.NextPrim());
            }
        }

        // update global pipeline stat
        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    }
717
#if USE_SIMD16_FRONTEND
    /// @brief simd16 variant of ClipSimd: clip a simd16 batch of prims and
    ///        re-bin the emitted vertices through a local PA and the simd16 binner.
    /// @param vPrimMask - lane mask of valid input prims.
    /// @param vClipMask - lane mask of prims that require clipping.
    /// @param pa - primitive assembler for the incoming prims.
    /// @param vPrimId - per-lane primitive ids, forwarded to the binner.
    void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId)
    {
        // input/output vertex store for clipper
        simd16vertex vertices[7]; // maximum 7 verts generated per triangle

        LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
        uint32_t provokingVertex = 0;
        if (pa.binTopology == TOP_TRIANGLE_FAN)
        {
            provokingVertex = this->state.frontendState.provokingVertex.triFan;
        }
        ///@todo: line topology for wireframe?

        // assemble pos
        simd16vector tmpVector[NumVertsPerPrim];
        pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector);
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
        }

        // assemble attribs
        const SWR_BACKEND_STATE& backendState = this->state.backendState;

        int32_t maxSlot = -1;
        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
        {
            // Compute absolute attrib slot in vertex array
            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
            uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;

            pa.Assemble_simd16(inputSlot, tmpVector);

            // if constant interpolation enabled for this attribute, assign the provoking
            // vertex values to all edges
            if (_bittest(&constantInterpMask, slot))
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                }
            }
            else
            {
                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
                {
                    vertices[i].attrib[inputSlot] = tmpVector[i];
                }
            }
        }

        // assemble user clip distances if enabled (distances 0-3 -> LO slot)
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
            }
        }

        // distances 4-7 -> HI slot
        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
            }
        }

        uint32_t numAttribs = maxSlot + 1;

        // run the clipper; returns the per-lane count of emitted vertices
        simd16scalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

        // set up new PA for binning clipped primitives
        PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc = nullptr;
        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
        if (NumVertsPerPrim == 3)
        {
            pfnBinFunc = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
            clipTopology = TOP_TRIANGLE_FAN;

            // so that the binner knows to bloat wide points later
            if (pa.binTopology == TOP_POINT_LIST)
                clipTopology = TOP_POINT_LIST;

        }
        else if (NumVertsPerPrim == 2)
        {
            pfnBinFunc = BinLines_simd16;
            clipTopology = TOP_LINE_LIST;
        }
        else
        {
            SWR_ASSERT(0 && "Unexpected points in clipper.");
        }

        uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
        uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;

        // byte offsets of each emitted vertex within the AoS vertex store
        const simdscalari vOffsets = _simd_set_epi32(
            0 * sizeof(simd16vertex), // unused lane
            6 * sizeof(simd16vertex),
            5 * sizeof(simd16vertex),
            4 * sizeof(simd16vertex),
            3 * sizeof(simd16vertex),
            2 * sizeof(simd16vertex),
            1 * sizeof(simd16vertex),
            0 * sizeof(simd16vertex));

        // only need to gather 7 verts
        // @todo dynamic mask based on actual # of verts generated per lane
        const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);

        uint32_t numClippedPrims = 0;

        // tranpose clipper output so that each lane's vertices are in SIMD order
        // set aside space for 2 vertices, as the PA will try to read up to 16 verts
        // for triangle fan

#if defined(_DEBUG)
        // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
        simd16vertex *transposedPrims = reinterpret_cast<simd16vertex *>(malloc(sizeof(simd16vertex) * 2));

#else
        simd16vertex transposedPrims[2];

#endif
        for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            // fully-clipped prim: nothing to bin
            if (numEmittedVerts < NumVertsPerPrim)
            {
                continue;
            }
            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
            numClippedPrims += numEmittedPrims;

            // tranpose clipper output so that each lane's vertices are in SIMD order
            // set aside space for 2 vertices, as the PA will try to read up to 16 verts
            // for triangle fan

            // transpose pos
            uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;

#if 0
            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
            static const float *dummy = reinterpret_cast<const float *>(pBase);
#endif

            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                pBase += sizeof(simd16scalar);
            }

            // transpose attribs
            pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
            {
                uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                    pBase += sizeof(simd16scalar);
                }
            }

            // transpose user clip distances if enabled
            if (this->state.rastState.clipDistanceMask & 0xf)
            {
                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                    pBase += sizeof(simd16scalar);
                }
            }

            if (this->state.rastState.clipDistanceMask & 0xf0)
            {
                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                    pBase += sizeof(simd16scalar);
                }
            }

            // local PA to re-assemble the clipped verts into prims for the binner
            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);

            while (clipPa.GetNextStreamOutput())
            {
                do
                {
                    simd16vector attrib[NumVertsPerPrim];
                    bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib);

                    if (assemble)
                    {
                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff };

                        clipPa.useAlternateOffset = false;
                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]));
                    }

                } while (clipPa.NextPrim());
            }
        }

#if defined(_DEBUG)
        free(transposedPrims);

#endif
        // update global pipeline stat
        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
    }

#endif
945 // execute the clipper stage
    // Clipper stage entry point for the SIMD8 frontend.
    // Computes clip codes, culls NaN / user-cull-distance / frustum-rejected
    // primitives, then either runs the full clipper (ClipSimd) for prims that
    // cross a clip plane or forwards trivially-accepted prims straight to the
    // binner.
    //   pa       - primitive assembly state for the current batch
    //   prim     - assembled vertex positions, one simdvector per vertex
    //   primMask - bitmask of valid primitive lanes
    //   primId   - per-lane primitive ids, forwarded to the binner
    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
    {
        SWR_ASSERT(this->pDC != nullptr);
        SWR_CONTEXT* pContext = this->pDC->pContext; // referenced by the UPDATE_STAT_FE macros below
        const API_STATE& apiState = this->pDC->pState->state;

        // set up binner based on PA state
        PFN_PROCESS_PRIMS pfnBinner;
        switch (pa.binTopology)
        {
        case TOP_POINT_LIST:
            pfnBinner = BinPoints;
            break;
        case TOP_LINE_LIST:
        case TOP_LINE_STRIP:
        case TOP_LINE_LOOP:
        case TOP_LINE_LIST_ADJ:
        case TOP_LISTSTRIP_ADJ:
            pfnBinner = BinLines;
            break;
        default:
            // all remaining topologies assemble triangles
            pfnBinner = GetBinTrianglesFunc((apiState.rastState.conservativeRast > 0));
            break;
        };

        // update clipper invocations pipeline stat
        uint32_t numInvoc = _mm_popcnt_u32(primMask);
        UPDATE_STAT_FE(CInvocations, numInvoc);

        // Read back viewport index if required
        simdscalari viewportIdx = _simd_set1_epi32(0);
        if (state.backendState.readViewportArrayIndex)
        {
            simdvector vpiAttrib[NumVertsPerPrim];
            pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
            simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);

            // OOB indices => forced to zero.
            simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
            simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
            viewportIdx = _simd_and_si(vClearMask, vpai);
        }

        ComputeClipCodes(prim, viewportIdx);

        // cull prims with NAN coords
        primMask &= ~ComputeNaNMask(prim);

        // user cull distance cull
        if (this->state.rastState.cullDistanceMask)
        {
            primMask &= ~ComputeUserClipCullMask(pa, prim);
        }

        // cull prims outside view frustum
        // (lanes whose clip-code intersection is zero are at least partially visible)
        simdscalar clipIntersection = ComputeClipCodeIntersection();
        int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));

        // skip clipping for points
        uint32_t clipMask = 0;
        if (NumVertsPerPrim != 1)
        {
            clipMask = primMask & ComputeClipMask();
        }

        if (clipMask)
        {
            AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
            // we have to clip tris, execute the clipper, which will also
            // call the binner
            ClipSimd(_simd_vmask_ps(primMask), _simd_vmask_ps(clipMask), pa, primId);
            AR_END(FEGuardbandClip, 1);
        }
        else if (validMask)
        {
            // update CPrimitives pipeline state
            UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));

            // forward valid prims directly to binner
            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
        }
    }
1028
1029 #if USE_SIMD16_FRONTEND
    // Clipper stage entry point for the SIMD16 frontend; mirrors the SIMD8
    // ExecuteStage above but selects the simd16 binners and intrinsics.
    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId)
    {
        SWR_ASSERT(pa.pDC != nullptr);
        SWR_CONTEXT* pContext = pa.pDC->pContext; // referenced by the UPDATE_STAT_FE macros below

        // set up binner based on PA state
        PFN_PROCESS_PRIMS_SIMD16 pfnBinner;
        switch (pa.binTopology)
        {
        case TOP_POINT_LIST:
            pfnBinner = BinPoints_simd16;
            break;
        case TOP_LINE_LIST:
        case TOP_LINE_STRIP:
        case TOP_LINE_LOOP:
        case TOP_LINE_LIST_ADJ:
        case TOP_LISTSTRIP_ADJ:
            pfnBinner = BinLines_simd16;
            break;
        default:
            // all remaining topologies assemble triangles
            pfnBinner = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
            break;
        };

        // update clipper invocations pipeline stat
        uint32_t numInvoc = _mm_popcnt_u32(primMask);
        UPDATE_STAT_FE(CInvocations, numInvoc);

        // Read back viewport index if required
        simd16scalari viewportIdx = _simd16_set1_epi32(0);
        if (state.backendState.readViewportArrayIndex)
        {
            simd16vector vpiAttrib[NumVertsPerPrim];
            pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);

            // OOB indices => forced to zero.
            simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
            simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
            simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
            viewportIdx = _simd16_and_si(vClearMask, vpai);
        }
        ComputeClipCodes(prim, viewportIdx);

        // cull prims with NAN coords
        primMask &= ~ComputeNaNMask(prim);

        // user cull distance cull
        if (this->state.rastState.cullDistanceMask)
        {
            primMask &= ~ComputeUserClipCullMask(pa, prim);
        }

        // cull prims outside view frustum
        simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16();
        int validMask = primMask & _simd16_cmpeq_ps_mask(clipIntersection, _simd16_setzero_ps());

        // skip clipping for points
        uint32_t clipMask = 0;
        if (NumVertsPerPrim != 1)
        {
            clipMask = primMask & ComputeClipMask_simd16();
        }

        if (clipMask)
        {
            AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
            // we have to clip tris, execute the clipper, which will also
            // call the binner
            ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
            AR_END(FEGuardbandClip, 1);
        }
        else if (validMask)
        {
            // update CPrimitives pipeline state
            UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));

            // forward valid prims directly to binner
            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
        }
    }
1110
1111 #endif
1112 private:
1113 inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
1114 {
1115 return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
1116 }
1117
1118 #if USE_SIMD16_FRONTEND
1119 inline simd16scalar ComputeInterpFactor(simd16scalar boundaryCoord0, simd16scalar boundaryCoord1)
1120 {
1121 return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1));
1122 }
1123
1124 #endif
1125 inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
1126 {
1127 const uint32_t simdVertexStride = sizeof(simdvertex);
1128 const uint32_t componentStride = sizeof(simdscalar);
1129 const uint32_t attribStride = sizeof(simdvector);
1130 const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
1131 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
1132
1133 // step to the simdvertex
1134 simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
1135
1136 // step to the attribute and component
1137 vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
1138
1139 // step to the lane
1140 vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
1141
1142 return vOffsets;
1143 }
1144
1145 #if USE_SIMD16_FRONTEND
1146 inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari vIndices, uint32_t component)
1147 {
1148 const uint32_t simdVertexStride = sizeof(simd16vertex);
1149 const uint32_t componentStride = sizeof(simd16scalar);
1150 const uint32_t attribStride = sizeof(simd16vector);
1151 const simd16scalari vElemOffset = _simd16_set_epi32(
1152 15 * sizeof(float), 14 * sizeof(float), 13 * sizeof(float), 12 * sizeof(float),
1153 11 * sizeof(float), 10 * sizeof(float), 9 * sizeof(float), 8 * sizeof(float),
1154 7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
1155 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
1156
1157 // step to the simdvertex
1158 simd16scalari vOffsets = _simd16_mullo_epi32(vIndices, _simd16_set1_epi32(simdVertexStride));
1159
1160 // step to the attribute and component
1161 vOffsets = _simd16_add_epi32(vOffsets, _simd16_set1_epi32(attribStride * attrib + componentStride * component));
1162
1163 // step to the lane
1164 vOffsets = _simd16_add_epi32(vOffsets, vElemOffset);
1165
1166 return vOffsets;
1167 }
1168
1169 #endif
1170 // gathers a single component for a given attribute for each SIMD lane
1171 inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
1172 {
1173 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1174 simdscalar vSrc = _mm256_undefined_ps();
1175 return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
1176 }
1177
1178 #if USE_SIMD16_FRONTEND
1179 inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component)
1180 {
1181 simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1182 simd16scalar vSrc = _simd16_setzero_ps();
1183 return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
1184 }
1185
1186 #endif
1187 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
1188 {
1189 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1190
1191 uint32_t* pOffsets = (uint32_t*)&vOffsets;
1192 float* pSrc = (float*)&vSrc;
1193 uint32_t mask = _simd_movemask_ps(vMask);
1194 DWORD lane;
1195 while (_BitScanForward(&lane, mask))
1196 {
1197 mask &= ~(1 << lane);
1198 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
1199 *(float*)pBuf = pSrc[lane];
1200 }
1201 }
1202
1203 #if USE_SIMD16_FRONTEND
1204 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component, simd16scalar vSrc)
1205 {
1206 simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1207
1208 uint32_t* pOffsets = (uint32_t*)&vOffsets;
1209 float* pSrc = (float*)&vSrc;
1210 uint32_t mask = _simd16_movemask_ps(vMask);
1211 DWORD lane;
1212 while (_BitScanForward(&lane, mask))
1213 {
1214 mask &= ~(1 << lane);
1215 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
1216 *(float*)pBuf = pSrc[lane];
1217 }
1218 }
1219
1220 #endif
    // Computes the intersection of edge (v1, v2) with the template-selected
    // frustum plane and scatters the resulting vertex (position, user
    // attributes, and enabled clip distances) into pOutVerts at outIndex for
    // every active lane.
    template<SWR_CLIPCODES ClippingPlane>
    inline void intersect(
        const simdscalar& vActiveMask,  // active lanes to operate on
        const simdscalari& s,           // index to first edge vertex v0 in pInPts.
        const simdscalari& p,           // index to second edge vertex v1 in pInPts.
        const simdvector& v1,           // vertex 0 position
        const simdvector& v2,           // vertex 1 position
        simdscalari& outIndex,          // output index.
        const float *pInVerts,          // array of all the input positions.
        uint32_t numInAttribs,          // number of attributes per vertex.
        float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
    {
        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

        // compute interpolation factor
        // t = d(v1) / (d(v1) - d(v2)), where d() is the signed distance of the
        // vertex to the selected plane in homogeneous clip space
        simdscalar t;
        switch (ClippingPlane)
        {
        case FRUSTUM_LEFT:      t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
        case FRUSTUM_RIGHT:     t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
        case FRUSTUM_TOP:       t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
        case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
        case FRUSTUM_NEAR:
            // DX Znear plane is 0, GL is -w
            if (this->state.rastState.clipHalfZ)
            {
                t = ComputeInterpFactor(v1[2], v2[2]);
            }
            else
            {
                t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
            }
            break;
        case FRUSTUM_FAR:       t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
        default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
            // NOTE: t is left uninitialized here; SWR_INVALID asserts in debug builds
        };

        // interpolate position and store
        for (uint32_t c = 0; c < 4; ++c)
        {
            // lerp: v1 + t * (v2 - v1)
            simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
            ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
        }

        // interpolate attributes and store
        for (uint32_t a = 0; a < numInAttribs; ++a)
        {
            uint32_t attribSlot = vertexAttribOffset + a;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }

        // interpolate clip distance if enabled (low 4 distances)
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }

        // high 4 clip distances
        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }
    }
1303
1304 #if USE_SIMD16_FRONTEND
    // simd16 variant of intersect(): computes the intersection of edge (v1, v2)
    // with the template-selected frustum plane and scatters the new vertex
    // (position, attributes, enabled clip distances) at outIndex per lane.
    template<SWR_CLIPCODES ClippingPlane>
    inline void intersect(
        const simd16scalar& vActiveMask,// active lanes to operate on
        const simd16scalari& s,         // index to first edge vertex v0 in pInPts.
        const simd16scalari& p,         // index to second edge vertex v1 in pInPts.
        const simd16vector& v1,         // vertex 0 position
        const simd16vector& v2,         // vertex 1 position
        simd16scalari& outIndex,        // output index.
        const float *pInVerts,          // array of all the input positions.
        uint32_t numInAttribs,          // number of attributes per vertex.
        float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
    {
        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

        // compute interpolation factor
        // t = d(v1) / (d(v1) - d(v2)), signed distance to the selected plane
        simd16scalar t;
        switch (ClippingPlane)
        {
        case FRUSTUM_LEFT:      t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[0]), _simd16_add_ps(v2[3], v2[0])); break;
        case FRUSTUM_RIGHT:     t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[0]), _simd16_sub_ps(v2[3], v2[0])); break;
        case FRUSTUM_TOP:       t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[1]), _simd16_add_ps(v2[3], v2[1])); break;
        case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[1]), _simd16_sub_ps(v2[3], v2[1])); break;
        case FRUSTUM_NEAR:
            // DX Znear plane is 0, GL is -w
            if (this->state.rastState.clipHalfZ)
            {
                t = ComputeInterpFactor(v1[2], v2[2]);
            }
            else
            {
                t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[2]), _simd16_add_ps(v2[3], v2[2]));
            }
            break;
        case FRUSTUM_FAR:       t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[2]), _simd16_sub_ps(v2[3], v2[2])); break;
        default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
            // NOTE: t is left uninitialized here; SWR_INVALID asserts in debug builds
        };

        // interpolate position and store
        for (uint32_t c = 0; c < 4; ++c)
        {
            // lerp: v1 + t * (v2 - v1)
            simd16scalar vOutPos = _simd16_fmadd_ps(_simd16_sub_ps(v2[c], v1[c]), t, v1[c]);
            ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
        }

        // interpolate attributes and store
        for (uint32_t a = 0; a < numInAttribs; ++a)
        {
            uint32_t attribSlot = vertexAttribOffset + a;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }

        // interpolate clip distance if enabled (low 4 distances)
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }

        // high 4 clip distances
        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
                simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
            }
        }
    }
1387
1388 #endif
1389 template<SWR_CLIPCODES ClippingPlane>
1390 inline simdscalar inside(const simdvector& v)
1391 {
1392 switch (ClippingPlane)
1393 {
1394 case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
1395 case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
1396 case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
1397 case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
1398 case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
1399 case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
1400 default:
1401 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1402 return _simd_setzero_ps();
1403 }
1404 }
1405
1406 #if USE_SIMD16_FRONTEND
1407 template<SWR_CLIPCODES ClippingPlane>
1408 inline simd16scalar inside(const simd16vector& v)
1409 {
1410 switch (ClippingPlane)
1411 {
1412 case FRUSTUM_LEFT: return _simd16_cmpge_ps(v[0], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
1413 case FRUSTUM_RIGHT: return _simd16_cmple_ps(v[0], v[3]);
1414 case FRUSTUM_TOP: return _simd16_cmpge_ps(v[1], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
1415 case FRUSTUM_BOTTOM: return _simd16_cmple_ps(v[1], v[3]);
1416 case FRUSTUM_NEAR: return _simd16_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd16_setzero_ps() : _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
1417 case FRUSTUM_FAR: return _simd16_cmple_ps(v[2], v[3]);
1418 default:
1419 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1420 return _simd16_setzero_ps();
1421 }
1422 }
1423
1424 #endif
    // One polygon-clip pass (Sutherland-Hodgman style) against a single frustum
    // plane, performed per SIMD lane. For each edge (s, p) of each lane's
    // polygon: emit s if inside, and emit the edge/plane intersection when the
    // edge crosses the plane. Returns the per-lane output vertex count.
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
        {
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
            // wrap p back to vertex 0 on the closing edge of each lane's polygon
            simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
            p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                // store position
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = vertexAttribOffset + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // store clip distance if enabled
                if (this->state.rastState.clipDistanceMask & 0xf)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                if (this->state.rastState.clipDistanceMask & 0xf0)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex (only in lanes that stored s)
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // increment loop index and update active mask
            vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
            vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
        }

        return vOutIndex;
    }
1519
1520 #if USE_SIMD16_FRONTEND
    // simd16 variant of ClipTriToPlane: one polygon-clip pass
    // (Sutherland-Hodgman style) against a single frustum plane, per lane.
    // Returns the per-lane output vertex count.
    template<SWR_CLIPCODES ClippingPlane>
    simd16scalari ClipTriToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

        simd16scalari vCurIndex = _simd16_setzero_si();
        simd16scalari vOutIndex = _simd16_setzero_si();
        simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));

        while (!_simd16_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
        {
            simd16scalari s = vCurIndex;
            simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));
            // wrap p back to vertex 0 on the closing edge of each lane's polygon
            simd16scalari underFlowMask = _simd16_cmpgt_epi32(vNumInPts, p);
            p = _simd16_castps_si(_simd16_blendv_ps(_simd16_setzero_ps(), _simd16_castsi_ps(p), _simd16_castsi_ps(underFlowMask)));

            // gather position
            simd16vector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simd16scalar s_in = inside<ClippingPlane>(vInPos0);
            simd16scalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
            intersectMask = _simd16_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd16_and_ps(s_in, vActiveMask);
            if (!_simd16_testz_ps(s_in, s_in))
            {
                // store position
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = vertexAttribOffset + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // store clip distance if enabled
                if (this->state.rastState.clipDistanceMask & 0xf)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                if (this->state.rastState.clipDistanceMask & 0xf0)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex (only in lanes that stored s)
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd16_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
            }

            // increment loop index and update active mask
            vCurIndex = _simd16_add_epi32(vCurIndex, _simd16_set1_epi32(1));
            vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
        }

        return vOutIndex;
    }
1615
1616 #endif
    // Clips a single line segment per lane against one frustum plane.
    // Unlike the triangle path this runs a single edge (s=0, p=1): emits s if
    // inside, the intersection point if the segment crosses the plane, then p
    // if inside. Returns the per-lane output vertex count.
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        if (!_simd_testz_ps(vActiveMask, vActiveMask))
        {
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // interpolate attributes and store
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = vertexAttribOffset + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex (only in lanes that stored s)
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // store p if inside
            p_in = _simd_and_ps(p_in, vActiveMask);
            if (!_simd_testz_ps(p_in, p_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                }

                // interpolate attributes and store
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = vertexAttribOffset + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex (only in lanes that stored p)
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
            }
        }

        return vOutIndex;
    }
1707
1708 #if USE_SIMD16_FRONTEND
    //////////////////////////////////////////////////////////////////////////
    /// @brief Clips one SIMD16-wide batch of line segments against a single
    ///        frustum plane (one segment s->p per lane).
    ///        Per lane, the output is: s if s is inside, then the s/p plane
    ///        intersection if the segment crosses the plane, then p if p is
    ///        inside — so each lane emits 0..2 vertices.
    /// @param ClippingPlane - template: which SWR_CLIPCODES plane to clip to.
    /// @param pInVerts - input vertices in SOA form.
    /// @param vNumInPts - per-lane count of valid input vertices; a lane is
    ///        active when its count is > 0 (assumes 0 or 2 for lines — TODO confirm).
    /// @param numInAttribs - number of vertex attributes to carry through.
    /// @param pOutVerts - output vertex buffer in SOA form.
    /// @return per-lane count of vertices written to pOutVerts.
    template<SWR_CLIPCODES ClippingPlane>
    simd16scalari ClipLineToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        // base slot of the attribute block within each vertex
        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

        simd16scalari vCurIndex = _simd16_setzero_si();
        simd16scalari vOutIndex = _simd16_setzero_si();
        // lane is active iff it has any input vertices (0 < vNumInPts)
        simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));

        if (!_simd16_testz_ps(vActiveMask, vActiveMask))
        {
            // s = segment start (vertex 0), p = segment end (vertex 1)
            simd16scalari s = vCurIndex;
            simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));

            // gather position for both endpoints of every active lane
            simd16vector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask for each endpoint against ClippingPlane
            simd16scalar s_in = inside<ClippingPlane>(vInPos0);
            simd16scalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in): segment crosses the plane
            simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
            intersectMask = _simd16_and_ps(intersectMask, vActiveMask);

            // store s if inside (mask limited to active lanes)
            s_in = _simd16_and_ps(s_in, vActiveMask);
            if (!_simd16_testz_ps(s_in, s_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // copy s's attributes through unchanged (no interpolation needed
                // for a vertex that is kept as-is)
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = vertexAttribOffset + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex only in lanes that stored s
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
            }

            // compute and store intersection for lanes whose segment crosses the plane
            if (!_simd16_testz_ps(intersectMask, intersectMask))
            {
                // intersect() writes the interpolated position/attributes at vOutIndex
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
            }

            // store p if inside (mask limited to active lanes)
            p_in = _simd16_and_ps(p_in, vActiveMask);
            if (!_simd16_testz_ps(p_in, p_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                }

                // copy p's attributes through unchanged
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = vertexAttribOffset + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex only in lanes that stored p
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), p_in);
            }
        }

        // per-lane number of output vertices produced
        return vOutIndex;
    }
1799 #endif
1800 //////////////////////////////////////////////////////////////////////////
1801 /// @brief Vertical clipper. Clips SIMD primitives at a time
1802 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
1803 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
1804 /// @param numAttribs - number of valid input attribs, including position
1805 simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
1806 {
1807 // temp storage
1808 float* pTempVerts = (float*)&tlsTempVertices[0];
1809
1810 // zero out num input verts for non-active lanes
1811 simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
1812 vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
1813
1814 // clip prims to frustum
1815 simdscalari vNumOutPts;
1816 if (NumVertsPerPrim == 3)
1817 {
1818 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1819 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1820 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1821 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1822 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1823 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1824 }
1825 else
1826 {
1827 SWR_ASSERT(NumVertsPerPrim == 2);
1828 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1829 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1830 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1831 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1832 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1833 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1834 }
1835
1836 // restore num verts for non-clipped, active lanes
1837 simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
1838 vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
1839
1840 return vNumOutPts;
1841 }
1842
1843 #if USE_SIMD16_FRONTEND
1844 simd16scalari ClipPrims(float* pVertices, const simd16scalar& vPrimMask, const simd16scalar& vClipMask, int numAttribs)
1845 {
1846 // temp storage
1847 float* pTempVerts = (float*)&tlsTempVertices_simd16[0];
1848
1849 // zero out num input verts for non-active lanes
1850 simd16scalari vNumInPts = _simd16_set1_epi32(NumVertsPerPrim);
1851 vNumInPts = _simd16_blendv_epi32(_simd16_setzero_si(), vNumInPts, vClipMask);
1852
1853 // clip prims to frustum
1854 simd16scalari vNumOutPts;
1855 if (NumVertsPerPrim == 3)
1856 {
1857 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1858 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1859 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1860 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1861 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1862 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1863 }
1864 else
1865 {
1866 SWR_ASSERT(NumVertsPerPrim == 2);
1867 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1868 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1869 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1870 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1871 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1872 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1873 }
1874
1875 // restore num verts for non-clipped, active lanes
1876 simd16scalar vNonClippedMask = _simd16_andnot_ps(vClipMask, vPrimMask);
1877 vNumOutPts = _simd16_blendv_epi32(vNumOutPts, _simd16_set1_epi32(NumVertsPerPrim), vNonClippedMask);
1878
1879 return vNumOutPts;
1880 }
1881
1882 #endif
    // worker thread index this clipper instance was created for
    // NOTE(review): not referenced by the methods visible here — confirm usage elsewhere
    const uint32_t workerId{ 0 };
    // draw context for the current draw; owned by the caller
    DRAW_CONTEXT* pDC{ nullptr };
    // API state snapshot; ClipLineToPlane reads state.backendState.vertexAttribOffset
    const API_STATE& state;
    // per-vertex clip codes for the current primitive batch (SIMD width)
    simdscalar clipCodes[NumVertsPerPrim];
#if USE_SIMD16_FRONTEND
    // per-vertex clip codes for the SIMD16-wide frontend path
    simd16scalar clipCodes_simd16[NumVertsPerPrim];
#endif
1890 };
1891
1892
// pipeline stage functions
// Binned clipper entry points invoked per primitive topology; each takes the
// draw context, primitive assembly state, worker id, SOA primitive vertices,
// a mask of valid primitives, and the per-lane primitive ids.
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
#if USE_SIMD16_FRONTEND
// SIMD16-wide variants of the above for the simd16 frontend
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
#endif
1902