swr/rast: code cleanup (no functional change)
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / clip.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD simd16vertex tlsTempVertices_simd16[7];
39 #endif
40
41 enum SWR_CLIPCODES
42 {
43 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
44 // The guardband codes can share a single high bit, distinguished by 4 separate LSBs, because guardband tests use the union, rather than the intersection, of clip codes.
45 #define CLIPCODE_SHIFT 23
46 FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT),
47 FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT),
48 FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT),
49 FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
50
51 FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
52 FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT),
53
54 NEGW = (0x40 << CLIPCODE_SHIFT),
55
56 GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1),
57 GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2),
58 GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4),
59 GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
60 };
61
62 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
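// Worked example of the float-compare trick: FRUSTUM_LEFT = 0x01 << 23 = 0x00800000, which is
// the bit pattern of FLT_MIN (2^-126), the smallest normalized float. Because every code sits
// in bits 23..30, any AND/OR combination of codes stays well out of the denormal range, so the
// masks can be safely tested with float compares against zero (see ComputeClipMask below).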
63
64 INLINE
65 void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
66 {
67 clipCodes = _simd_setzero_ps();
68
69 // -w
70 simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
71
72 // FRUSTUM_LEFT
73 simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
74 clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
75
76 // FRUSTUM_TOP
77 vRes = _simd_cmplt_ps(vertex.y, vNegW);
78 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
79
80 // FRUSTUM_RIGHT
81 vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
82 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
83
84 // FRUSTUM_BOTTOM
85 vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
86 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
87
88 if (state.rastState.depthClipEnable)
89 {
90 // FRUSTUM_NEAR
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state.rastState.clipHalfZ)
93 {
94 vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
95 }
96 else
97 {
98 vRes = _simd_cmplt_ps(vertex.z, vNegW);
99 }
100 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
101
102 // FRUSTUM_FAR
103 vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
104 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
105 }
106
107 // NEGW
108 vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
109 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
110
111 // GUARDBAND_LEFT
112 simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
113 vRes = _simd_cmplt_ps(vertex.x, gbMult);
114 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
115
116 // GUARDBAND_TOP
117 gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
118 vRes = _simd_cmplt_ps(vertex.y, gbMult);
119 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
120
121 // GUARDBAND_RIGHT
122 gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
123 vRes = _simd_cmpgt_ps(vertex.x, gbMult);
124 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
125
126 // GUARDBAND_BOTTOM
127 gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
128 vRes = _simd_cmpgt_ps(vertex.y, gbMult);
129 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
130 }
131
132 #if USE_SIMD16_FRONTEND
133 INLINE
134 void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari viewportIndexes)
135 {
136 clipCodes = _simd16_setzero_ps();
137
138 // -w
139 simd16scalar vNegW = _simd16_mul_ps(vertex.w, _simd16_set1_ps(-1.0f));
140
141 // FRUSTUM_LEFT
142 simd16scalar vRes = _simd16_cmplt_ps(vertex.x, vNegW);
143 clipCodes = _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_LEFT)));
144
145 // FRUSTUM_TOP
146 vRes = _simd16_cmplt_ps(vertex.y, vNegW);
147 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_TOP))));
148
149 // FRUSTUM_RIGHT
150 vRes = _simd16_cmpgt_ps(vertex.x, vertex.w);
151 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_RIGHT))));
152
153 // FRUSTUM_BOTTOM
154 vRes = _simd16_cmpgt_ps(vertex.y, vertex.w);
155 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_BOTTOM))));
156
157 if (state.rastState.depthClipEnable)
158 {
159 // FRUSTUM_NEAR
160 // DX clips depth [0..w], GL clips [-w..w]
161 if (state.rastState.clipHalfZ)
162 {
163 vRes = _simd16_cmplt_ps(vertex.z, _simd16_setzero_ps());
164 }
165 else
166 {
167 vRes = _simd16_cmplt_ps(vertex.z, vNegW);
168 }
169 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_NEAR))));
170
171 // FRUSTUM_FAR
172 vRes = _simd16_cmpgt_ps(vertex.z, vertex.w);
173 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_FAR))));
174 }
175
176 // NEGW
177 vRes = _simd16_cmple_ps(vertex.w, _simd16_setzero_ps());
178 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(NEGW))));
179
180 // GUARDBAND_LEFT
181 simd16scalar gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
182 vRes = _simd16_cmplt_ps(vertex.x, gbMult);
183 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_LEFT))));
184
185 // GUARDBAND_TOP
186 gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
187 vRes = _simd16_cmplt_ps(vertex.y, gbMult);
188 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_TOP))));
189
190 // GUARDBAND_RIGHT
191 gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
192 vRes = _simd16_cmpgt_ps(vertex.x, gbMult);
193 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_RIGHT))));
194
195 // GUARDBAND_BOTTOM
196 gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
197 vRes = _simd16_cmpgt_ps(vertex.y, gbMult);
198 clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_BOTTOM))));
199 }
200
201 #endif
202 template<uint32_t NumVertsPerPrim>
203 class Clipper
204 {
205 public:
206 Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
207 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
208 {
209 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
210 }
211
212 void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
213 {
214 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
215 {
216 ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
217 }
218 }
219
220 #if USE_SIMD16_FRONTEND
221 void ComputeClipCodes(simd16vector vertex[], simd16scalari viewportIndexes)
222 {
223 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
224 {
225 ::ComputeClipCodes(this->state, vertex[i], this->clipCodes_simd16[i], viewportIndexes);
226 }
227 }
228
229 #endif
230 simdscalar ComputeClipCodeIntersection()
231 {
232 simdscalar result = this->clipCodes[0];
233 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
234 {
235 result = _simd_and_ps(result, this->clipCodes[i]);
236 }
237 return result;
238 }
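// A nonzero lane in the intersection means every vertex of that prim is outside the same
// plane, so the prim can be trivially rejected (see validMask in ExecuteStage). The union
// (ComputeClipCodeUnion below) is nonzero if any vertex is outside any plane, flagging prims
// that may need clipping.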
239
240 #if USE_SIMD16_FRONTEND
241 simd16scalar ComputeClipCodeIntersection_simd16()
242 {
243 simd16scalar result = this->clipCodes_simd16[0];
244 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
245 {
246 result = _simd16_and_ps(result, this->clipCodes_simd16[i]);
247 }
248 return result;
249 }
250
251 #endif
252 simdscalar ComputeClipCodeUnion()
253 {
254 simdscalar result = this->clipCodes[0];
255 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
256 {
257 result = _simd_or_ps(result, this->clipCodes[i]);
258 }
259 return result;
260 }
261
262 #if USE_SIMD16_FRONTEND
263 simd16scalar ComputeClipCodeUnion_simd16()
264 {
265 simd16scalar result = this->clipCodes_simd16[0];
266 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
267 {
268 result = _simd16_or_ps(result, this->clipCodes_simd16[i]);
269 }
270 return result;
271 }
272
273 #endif
274 int ComputeNegWMask()
275 {
276 simdscalar clipCodeUnion = ComputeClipCodeUnion();
277 clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
278 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
279 }
280
281 int ComputeClipMask()
282 {
283 simdscalar clipUnion = ComputeClipCodeUnion();
284 clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
285 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
286 }
287
288 #if USE_SIMD16_FRONTEND
289 int ComputeClipMask_simd16()
290 {
291 simd16scalar clipUnion = ComputeClipCodeUnion_simd16();
292 clipUnion = _simd16_and_ps(clipUnion, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK)));
293 return _simd16_movemask_ps(_simd16_cmpneq_ps(clipUnion, _simd16_setzero_ps()));
294 }
295
296 #endif
297 // clipper is responsible for culling any prims with NaN coordinates; the unordered compares (_CMP_UNORD_Q) below are true if either operand is NaN, so pairing x with y and z with w covers all four position components
298 int ComputeNaNMask(simdvector prim[])
299 {
300 simdscalar vNanMask = _simd_setzero_ps();
301 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
302 {
303 simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
304 vNanMask = _simd_or_ps(vNanMask, vNan01);
305 simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
306 vNanMask = _simd_or_ps(vNanMask, vNan23);
307 }
308
309 return _simd_movemask_ps(vNanMask);
310 }
311
312 #if USE_SIMD16_FRONTEND
313 int ComputeNaNMask(simd16vector prim[])
314 {
315 simd16scalar vNanMask = _simd16_setzero_ps();
316 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
317 {
318 simd16scalar vNan01 = _simd16_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
319 vNanMask = _simd16_or_ps(vNanMask, vNan01);
320 simd16scalar vNan23 = _simd16_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
321 vNanMask = _simd16_or_ps(vNanMask, vNan23);
322 }
323
324 return _simd16_movemask_ps(vNanMask);
325 }
326
327 #endif
328 int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
329 {
330 uint8_t cullMask = this->state.rastState.cullDistanceMask;
331 simdscalar vClipCullMask = _simd_setzero_ps();
332 DWORD index;
333
334 simdvector vClipCullDistLo[3];
335 simdvector vClipCullDistHi[3];
336
337 pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
338 pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
339 while (_BitScanForward(&index, cullMask))
340 {
341 cullMask &= ~(1 << index);
342 uint32_t slot = index >> 2;
343 uint32_t component = index & 0x3;
344
345 simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
346 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
347 {
348 simdscalar vCullComp;
349 if (slot == 0)
350 {
351 vCullComp = vClipCullDistLo[e][component];
352 }
353 else
354 {
355 vCullComp = vClipCullDistHi[e][component];
356 }
357
358 // flag this vertex if cull distance < 0 or NaN; the AND across vertices culls the prim only when every vertex is out
359 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vCullComp, _CMP_NLE_UQ);
360 vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
361 }
362 vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
363 }
364
365 // clipper should also discard any primitive with NAN clip distance
366 uint8_t clipMask = this->state.rastState.clipDistanceMask;
367 while (_BitScanForward(&index, clipMask))
368 {
369 clipMask &= ~(1 << index);
370 uint32_t slot = index >> 2;
371 uint32_t component = index & 0x3;
372
373 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
374 {
375 simdscalar vClipComp;
376 if (slot == 0)
377 {
378 vClipComp = vClipCullDistLo[e][component];
379 }
380 else
381 {
382 vClipComp = vClipCullDistHi[e][component];
383 }
384
385 simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
386 vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
387 }
388 }
389
390 return _simd_movemask_ps(vClipCullMask);
391 }
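// Net effect: a prim is culled when all of its vertices are negative (or NaN) in some enabled
// cull-distance component, and discarded when any enabled clip-distance component is NaN on
// any vertex.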
392
393 #if USE_SIMD16_FRONTEND
394 int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
395 {
396 uint8_t cullMask = this->state.rastState.cullDistanceMask;
397 simd16scalar vClipCullMask = _simd16_setzero_ps();
398
399 simd16vector vClipCullDistLo[3];
400 simd16vector vClipCullDistHi[3];
401
402 pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
403 pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
404
405 DWORD index;
406 while (_BitScanForward(&index, cullMask))
407 {
408 cullMask &= ~(1 << index);
409 uint32_t slot = index >> 2;
410 uint32_t component = index & 0x3;
411
412 simd16scalar vCullMaskElem = _simd16_set1_ps(-1.0f);
413 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
414 {
415 simd16scalar vCullComp;
416 if (slot == 0)
417 {
418 vCullComp = vClipCullDistLo[e][component];
419 }
420 else
421 {
422 vCullComp = vClipCullDistHi[e][component];
423 }
424
425 // flag this vertex if cull distance < 0 or NaN; the AND across vertices culls the prim only when every vertex is out
426 simd16scalar vCull = _simd16_cmp_ps(_simd16_setzero_ps(), vCullComp, _CMP_NLE_UQ);
427 vCullMaskElem = _simd16_and_ps(vCullMaskElem, vCull);
428 }
429 vClipCullMask = _simd16_or_ps(vClipCullMask, vCullMaskElem);
430 }
431
432 // clipper should also discard any primitive with NAN clip distance
433 uint8_t clipMask = this->state.rastState.clipDistanceMask;
434 while (_BitScanForward(&index, clipMask))
435 {
436 clipMask &= ~(1 << index);
437 uint32_t slot = index >> 2;
438 uint32_t component = index & 0x3;
439
440 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
441 {
442 simd16scalar vClipComp;
443 if (slot == 0)
444 {
445 vClipComp = vClipCullDistLo[e][component];
446 }
447 else
448 {
449 vClipComp = vClipCullDistHi[e][component];
450 }
451
452 simd16scalar vClip = _simd16_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
453 vClipCullMask = _simd16_or_ps(vClipCullMask, vClip);
454 }
455 }
456
457 return _simd16_movemask_ps(vClipCullMask);
458 }
459
460 #endif
461 // clip SIMD primitives
462 void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
463 {
464 // input/output vertex store for clipper
465 simdvertex vertices[7]; // maximum 7 verts generated per triangle
466
467 LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
468 uint32_t provokingVertex = 0;
469 if(pa.binTopology == TOP_TRIANGLE_FAN)
470 {
471 provokingVertex = this->state.frontendState.provokingVertex.triFan;
472 }
473 ///@todo: line topology for wireframe?
474
475 // assemble pos
476 simdvector tmpVector[NumVertsPerPrim];
477 pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
478 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
479 {
480 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
481 }
482
483 // assemble attribs
484 const SWR_BACKEND_STATE& backendState = this->state.backendState;
485
486 int32_t maxSlot = -1;
487 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
488 {
489 // Compute absolute attrib slot in vertex array
490 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
491 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
492 uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
493
494 pa.Assemble(inputSlot, tmpVector);
495
496 // if constant interpolation enabled for this attribute, assign the provoking
497 // vertex values to all edges
498 if (_bittest(&constantInterpMask, slot))
499 {
500 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
501 {
502 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
503 }
504 }
505 else
506 {
507 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
508 {
509 vertices[i].attrib[inputSlot] = tmpVector[i];
510 }
511 }
512 }
513
514 // assemble user clip distances if enabled
515 if (this->state.rastState.clipDistanceMask & 0xf)
516 {
517 pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
518 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
519 {
520 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
521 }
522 }
523
524 if (this->state.rastState.clipDistanceMask & 0xf0)
525 {
526 pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
527 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
528 {
529 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
530 }
531 }
532
533 uint32_t numAttribs = maxSlot + 1;
534
535 simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
536
537 // set up new PA for binning clipped primitives
538 PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
539 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
540 if (NumVertsPerPrim == 3)
541 {
542 pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
543 clipTopology = TOP_TRIANGLE_FAN;
544
545 // so that the binner knows to bloat wide points later
546 if (pa.binTopology == TOP_POINT_LIST)
547 clipTopology = TOP_POINT_LIST;
548
549 }
550 else if (NumVertsPerPrim == 2)
551 {
552 pfnBinFunc = BinLines;
553 clipTopology = TOP_LINE_LIST;
554 }
555 else
556 {
557 SWR_ASSERT(0 && "Unexpected points in clipper.");
558 }
559
560 uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
561 uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
562 uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
563
564 const simdscalari vOffsets = _mm256_set_epi32(
565 0 * sizeof(simdvertex), // unused lane
566 6 * sizeof(simdvertex),
567 5 * sizeof(simdvertex),
568 4 * sizeof(simdvertex),
569 3 * sizeof(simdvertex),
570 2 * sizeof(simdvertex),
571 1 * sizeof(simdvertex),
572 0 * sizeof(simdvertex));
573
574 // only need to gather 7 verts
575 // @todo dynamic mask based on actual # of verts generated per lane
576 const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
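// Each lane of the masked gathers below reads the same per-lane float (column inputPrim within
// the simdvertex) from 7 consecutive simdvertex structures, transposing the clipper's per-lane
// output back into SIMD order; the mask keeps the unused 8th lane from reading.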
577
578 uint32_t numClippedPrims = 0;
579 #if USE_SIMD16_FRONTEND
580 const uint32_t numPrims = pa.NumPrims();
581 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
582
583 SWR_ASSERT(numPrims <= numPrims_lo); // i.e. numPrims <= KNOB_SIMD_WIDTH; this SIMD8 path expects at most one SIMD8 batch of prims
584
585 for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim)
586 #else
587 for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
588 #endif
589 {
590 uint32_t numEmittedVerts = pVertexCount[inputPrim];
591 if (numEmittedVerts < NumVertsPerPrim)
592 {
593 continue;
594 }
595 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
596
597 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
598 numClippedPrims += numEmittedPrims;
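// for the TOP_TRIANGLE_FAN clip topology, n emitted verts form n - 2 triangles;
// for TOP_LINE_LIST, n / 2 lines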
599
600 // transpose clipper output so that each lane's vertices are in SIMD order
601 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
602 // for triangle fan
603 #if USE_SIMD16_FRONTEND
604 simd16vertex transposedPrims[2];
605 #else
606 simdvertex transposedPrims[2];
607 #endif
608
609 // transpose pos
610 uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
611
612 #if USE_SIMD16_FRONTEND
613 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
614 static const float *dummy = reinterpret_cast<const float *>(pBase);
615 #endif
616
617 for (uint32_t c = 0; c < 4; ++c)
618 {
619 #if USE_SIMD16_FRONTEND
620 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
621 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
622 #else
623 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
624 #endif
625 pBase += sizeof(simdscalar);
626 }
627
628 // transpose attribs
629 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
630 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
631 {
632 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
633 for (uint32_t c = 0; c < 4; ++c)
634 {
635 #if USE_SIMD16_FRONTEND
636 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
637 transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
638 #else
639 transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
640 #endif
641 pBase += sizeof(simdscalar);
642 }
643 }
644
645 // transpose user clip distances if enabled
646 if (this->state.rastState.clipDistanceMask & 0xf)
647 {
648 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
649 for (uint32_t c = 0; c < 4; ++c)
650 {
651 #if USE_SIMD16_FRONTEND
652 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
653 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
654 #else
655 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
656 #endif
657 pBase += sizeof(simdscalar);
658 }
659 }
660
661 if (this->state.rastState.clipDistanceMask & 0xf0)
662 {
663 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
664 for (uint32_t c = 0; c < 4; ++c)
665 {
666 #if USE_SIMD16_FRONTEND
667 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
668 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
669 #else
670 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
671 #endif
672 pBase += sizeof(simdscalar);
673 }
674 }
675
676 PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
677
678 while (clipPa.GetNextStreamOutput())
679 {
680 do
681 {
682 #if USE_SIMD16_FRONTEND
683 simd16vector attrib_simd16[NumVertsPerPrim];
684 bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
685
686 if (assemble)
687 {
688 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
689
690 simdvector attrib[NumVertsPerPrim];
691 for (uint32_t i = 0; i < NumVertsPerPrim; i += 1)
692 {
693 for (uint32_t j = 0; j < 4; j += 1)
694 {
695 attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
696 }
697 }
698
699 clipPa.useAlternateOffset = false;
700 pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
701 }
702 #else
703 simdvector attrib[NumVertsPerPrim];
704 bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
705 if (assemble)
706 {
707 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
708 pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
709 }
710 #endif
711 } while (clipPa.NextPrim());
712 }
713 }
714
715 // update global pipeline stat
716 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
717 }
718
719 #if USE_SIMD16_FRONTEND
720 void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId, const simd16scalari& vViewportIdx)
721 {
722 // input/output vertex store for clipper
723 simd16vertex vertices[7]; // maximum 7 verts generated per triangle
724
725 LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
726 uint32_t provokingVertex = 0;
727 if (pa.binTopology == TOP_TRIANGLE_FAN)
728 {
729 provokingVertex = this->state.frontendState.provokingVertex.triFan;
730 }
731 ///@todo: line topology for wireframe?
732
733 // assemble pos
734 simd16vector tmpVector[NumVertsPerPrim];
735 pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector);
736 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
737 {
738 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
739 }
740
741 // assemble attribs
742 const SWR_BACKEND_STATE& backendState = this->state.backendState;
743
744 int32_t maxSlot = -1;
745 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
746 {
747 // Compute absolute attrib slot in vertex array
748 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
749 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
750 uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
751
752 pa.Assemble_simd16(inputSlot, tmpVector);
753
754 // if constant interpolation enabled for this attribute, assign the provoking
755 // vertex values to all edges
756 if (_bittest(&constantInterpMask, slot))
757 {
758 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
759 {
760 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
761 }
762 }
763 else
764 {
765 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
766 {
767 vertices[i].attrib[inputSlot] = tmpVector[i];
768 }
769 }
770 }
771
772 // assemble user clip distances if enabled
773 if (this->state.rastState.clipDistanceMask & 0xf)
774 {
775 pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
776 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
777 {
778 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
779 }
780 }
781
782 if (this->state.rastState.clipDistanceMask & 0xf0)
783 {
784 pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
785 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
786 {
787 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
788 }
789 }
790
791 uint32_t numAttribs = maxSlot + 1;
792
793 simd16scalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
794
795 // set up new PA for binning clipped primitives
796 PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc = nullptr;
797 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
798 if (NumVertsPerPrim == 3)
799 {
800 pfnBinFunc = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
801 clipTopology = TOP_TRIANGLE_FAN;
802
803 // so that the binner knows to bloat wide points later
804 if (pa.binTopology == TOP_POINT_LIST)
805 clipTopology = TOP_POINT_LIST;
806
807 }
808 else if (NumVertsPerPrim == 2)
809 {
810 pfnBinFunc = BinLines_simd16;
811 clipTopology = TOP_LINE_LIST;
812 }
813 else
814 {
815 SWR_ASSERT(0 && "Unexpected points in clipper.");
816 }
817
818 uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
819 uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
820 uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
821
822 const simdscalari vOffsets = _simd_set_epi32(
823 0 * sizeof(simd16vertex), // unused lane
824 6 * sizeof(simd16vertex),
825 5 * sizeof(simd16vertex),
826 4 * sizeof(simd16vertex),
827 3 * sizeof(simd16vertex),
828 2 * sizeof(simd16vertex),
829 1 * sizeof(simd16vertex),
830 0 * sizeof(simd16vertex));
831
832 // only need to gather 7 verts
833 // @todo dynamic mask based on actual # of verts generated per lane
834 const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
835
836 uint32_t numClippedPrims = 0;
837
838 // transpose clipper output so that each lane's vertices are in SIMD order
839 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
840 // for triangle fan
841
842 #if defined(_DEBUG)
843 // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
844 simd16vertex *transposedPrims = reinterpret_cast<simd16vertex *>(malloc(sizeof(simd16vertex) * 2));
SWR_ASSERT(transposedPrims != nullptr); // debug-only heap fallback; note plain malloc may not guarantee SIMD-width alignment
845
846 #else
847 simd16vertex transposedPrims[2];
848
849 #endif
850 for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
851 {
852 uint32_t numEmittedVerts = pVertexCount[inputPrim];
853 if (numEmittedVerts < NumVertsPerPrim)
854 {
855 continue;
856 }
857 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
858
859 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
860 numClippedPrims += numEmittedPrims;
861
862 // transpose this prim's clipper output into SIMD order (transposedPrims buffer allocated above)
865
866 // transpose pos
867 uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
868
869 #if 0
870 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
871 static const float *dummy = reinterpret_cast<const float *>(pBase);
872 #endif
873
874 for (uint32_t c = 0; c < 4; ++c)
875 {
876 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
877 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
878 pBase += sizeof(simd16scalar);
879 }
880
881 // transpose attribs
882 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
883 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
884 {
885 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
886 for (uint32_t c = 0; c < 4; ++c)
887 {
888 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
889 transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
890 pBase += sizeof(simd16scalar);
891 }
892 }
893
894 // transpose user clip distances if enabled
895 if (this->state.rastState.clipDistanceMask & 0xf)
896 {
897 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
898 for (uint32_t c = 0; c < 4; ++c)
899 {
900 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
901 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
902 pBase += sizeof(simd16scalar);
903 }
904 }
905
906 if (this->state.rastState.clipDistanceMask & 0xf0)
907 {
908 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
909 for (uint32_t c = 0; c < 4; ++c)
910 {
911 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
912 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
913 pBase += sizeof(simd16scalar);
914 }
915 }
916
917 PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
918
919 while (clipPa.GetNextStreamOutput())
920 {
921 do
922 {
923 simd16vector attrib[NumVertsPerPrim];
924 bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib);
925
926 if (assemble)
927 {
928 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff };
929
930 clipPa.useAlternateOffset = false;
931 pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]), _simd16_set1_epi32(pViewportIdx[inputPrim]));
932 }
933
934 } while (clipPa.NextPrim());
935 }
936 }
937
938 #if defined(_DEBUG)
939 free(transposedPrims);
940
941 #endif
942 // update global pipeline stat
943 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
944 }
945
946 #endif
947 // execute the clipper stage
948 void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
949 {
950 SWR_ASSERT(this->pDC != nullptr);
951 SWR_CONTEXT* pContext = this->pDC->pContext;
952 const API_STATE& apiState = this->pDC->pState->state;
953
954 // set up binner based on PA state
955 PFN_PROCESS_PRIMS pfnBinner;
956 switch (pa.binTopology)
957 {
958 case TOP_POINT_LIST:
959 pfnBinner = BinPoints;
960 break;
961 case TOP_LINE_LIST:
962 case TOP_LINE_STRIP:
963 case TOP_LINE_LOOP:
964 case TOP_LINE_LIST_ADJ:
965 case TOP_LISTSTRIP_ADJ:
966 pfnBinner = BinLines;
967 break;
968 default:
969 pfnBinner = GetBinTrianglesFunc((apiState.rastState.conservativeRast > 0));
970 break;
971 }
972
973 // update clipper invocations pipeline stat
974 uint32_t numInvoc = _mm_popcnt_u32(primMask);
975 UPDATE_STAT_FE(CInvocations, numInvoc);
976
977 ComputeClipCodes(prim, viewportIdx);
978
979 // cull prims with NAN coords
980 primMask &= ~ComputeNaNMask(prim);
981
982 // user cull distance cull
983 if (this->state.rastState.cullDistanceMask)
984 {
985 primMask &= ~ComputeUserClipCullMask(pa, prim);
986 }
987
988 // cull prims outside view frustum
989 simdscalar clipIntersection = ComputeClipCodeIntersection();
990 int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
991
992 // skip clipping for points
993 uint32_t clipMask = 0;
994 if (NumVertsPerPrim != 1)
995 {
996 clipMask = primMask & ComputeClipMask();
997 }
998
999 if (clipMask)
1000 {
1001 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
1002 // we have to clip tris, execute the clipper, which will also
1003 // call the binner
1004 ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
1005 AR_END(FEGuardbandClip, 1);
1006 }
1007 else if (validMask)
1008 {
1009 // update CPrimitives pipeline state
1010 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
1011
1012 // forward valid prims directly to binner
1013 pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
1014 }
1015 }
1016
1017 #if USE_SIMD16_FRONTEND
1018 void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
1019 {
1020 SWR_ASSERT(pa.pDC != nullptr);
1021 SWR_CONTEXT* pContext = pa.pDC->pContext;
1022
1023 // set up binner based on PA state
1024 PFN_PROCESS_PRIMS_SIMD16 pfnBinner;
1025 switch (pa.binTopology)
1026 {
1027 case TOP_POINT_LIST:
1028 pfnBinner = BinPoints_simd16;
1029 break;
1030 case TOP_LINE_LIST:
1031 case TOP_LINE_STRIP:
1032 case TOP_LINE_LOOP:
1033 case TOP_LINE_LIST_ADJ:
1034 case TOP_LISTSTRIP_ADJ:
1035 pfnBinner = BinLines_simd16;
1036 break;
1037 default:
1038 pfnBinner = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
1039 break;
1040 }
1041
1042 // update clipper invocations pipeline stat
1043 uint32_t numInvoc = _mm_popcnt_u32(primMask);
1044 UPDATE_STAT_FE(CInvocations, numInvoc);
1045
1046 ComputeClipCodes(prim, viewportIdx);
1047
1048 // cull prims with NAN coords
1049 primMask &= ~ComputeNaNMask(prim);
1050
1051 // user cull distance cull
1052 if (this->state.rastState.cullDistanceMask)
1053 {
1054 primMask &= ~ComputeUserClipCullMask(pa, prim);
1055 }
1056
1057 // cull prims outside view frustum
1058 simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16();
1059 int validMask = primMask & _simd16_movemask_ps(_simd16_cmpeq_ps(clipIntersection, _simd16_setzero_ps()));
1060
1061 // skip clipping for points
1062 uint32_t clipMask = 0;
1063 if (NumVertsPerPrim != 1)
1064 {
1065 clipMask = primMask & ComputeClipMask_simd16();
1066 }
1067
1068 if (clipMask)
1069 {
1070 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
1071 // we have to clip tris, execute the clipper, which will also
1072 // call the binner
1073 ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId, viewportIdx);
1074 AR_END(FEGuardbandClip, 1);
1075 }
1076 else if (validMask)
1077 {
1078 // update CPrimitives pipeline state
1079 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
1080
1081 // forward valid prims directly to binner
1082 pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
1083 }
1084 }
1085
1086 #endif
1087 private:
1088 inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
1089 {
1090 return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
1091 }
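// Derivation: along the edge the boundary coordinate varies linearly, b(t) = b0 + t * (b1 - b0);
// solving b(t) = 0 for the crossing gives t = b0 / (b0 - b1).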
1092
1093 #if USE_SIMD16_FRONTEND
1094 inline simd16scalar ComputeInterpFactor(simd16scalar boundaryCoord0, simd16scalar boundaryCoord1)
1095 {
1096 return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1));
1097 }
1098
1099 #endif
1100 inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
1101 {
1102 const uint32_t simdVertexStride = sizeof(simdvertex);
1103 const uint32_t componentStride = sizeof(simdscalar);
1104 const uint32_t attribStride = sizeof(simdvector);
1105 const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
1106 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
1107
1108 // step to the simdvertex
1109 simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
1110
1111 // step to the attribute and component
1112 vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
1113
1114 // step to the lane
1115 vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
1116
1117 return vOffsets;
1118 }
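// Byte offset per lane:
//   offset = vertexIndex * sizeof(simdvertex)   // pick the vertex
//          + attrib * sizeof(simdvector)        // pick the attribute
//          + component * sizeof(simdscalar)     // pick x/y/z/w within the attribute
//          + lane * sizeof(float)               // pick this lane's float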
1119
1120 #if USE_SIMD16_FRONTEND
1121 inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari vIndices, uint32_t component)
1122 {
1123 const uint32_t simdVertexStride = sizeof(simd16vertex);
1124 const uint32_t componentStride = sizeof(simd16scalar);
1125 const uint32_t attribStride = sizeof(simd16vector);
1126 const simd16scalari vElemOffset = _simd16_set_epi32(
1127 15 * sizeof(float), 14 * sizeof(float), 13 * sizeof(float), 12 * sizeof(float),
1128 11 * sizeof(float), 10 * sizeof(float), 9 * sizeof(float), 8 * sizeof(float),
1129 7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
1130 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
1131
1132 // step to the simdvertex
1133 simd16scalari vOffsets = _simd16_mullo_epi32(vIndices, _simd16_set1_epi32(simdVertexStride));
1134
1135 // step to the attribute and component
1136 vOffsets = _simd16_add_epi32(vOffsets, _simd16_set1_epi32(attribStride * attrib + componentStride * component));
1137
1138 // step to the lane
1139 vOffsets = _simd16_add_epi32(vOffsets, vElemOffset);
1140
1141 return vOffsets;
1142 }
1143
1144 #endif
1145 // gathers a single component for a given attribute for each SIMD lane
1146 inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
1147 {
1148 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1149 simdscalar vSrc = _mm256_undefined_ps();
1150 return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
1151 }
1152
1153 #if USE_SIMD16_FRONTEND
1154 inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component)
1155 {
1156 simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1157 simd16scalar vSrc = _simd16_setzero_ps();
1158 return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
1159 }
1160
1161 #endif
1162 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
1163 {
1164 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1165
1166 uint32_t* pOffsets = (uint32_t*)&vOffsets;
1167 float* pSrc = (float*)&vSrc;
1168 uint32_t mask = _simd_movemask_ps(vMask);
1169 DWORD lane;
1170 while (_BitScanForward(&lane, mask))
1171 {
1172 mask &= ~(1 << lane);
1173 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
1174 *(float*)pBuf = pSrc[lane];
1175 }
1176 }
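// Scatter is emulated with a scalar loop over the set mask bits: AVX/AVX2 provide gathers but
// no scatter instruction (scatter arrives with AVX-512), so each active lane's float is stored
// individually at its computed offset.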
1177
1178 #if USE_SIMD16_FRONTEND
1179 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component, simd16scalar vSrc)
1180 {
1181 simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
1182
1183 uint32_t* pOffsets = (uint32_t*)&vOffsets;
1184 float* pSrc = (float*)&vSrc;
1185 uint32_t mask = _simd16_movemask_ps(vMask);
1186 DWORD lane;
1187 while (_BitScanForward(&lane, mask))
1188 {
1189 mask &= ~(1 << lane);
1190 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
1191 *(float*)pBuf = pSrc[lane];
1192 }
1193 }
1194
1195 #endif
1196 template<SWR_CLIPCODES ClippingPlane>
1197 inline void intersect(
1198 const simdscalar& vActiveMask, // active lanes to operate on
1199 const simdscalari& s, // index to first edge vertex v0 in pInPts.
1200 const simdscalari& p, // index to second edge vertex v1 in pInPts.
1201 const simdvector& v1, // vertex 0 position
1202 const simdvector& v2, // vertex 1 position
1203 simdscalari& outIndex, // output index.
1204 const float *pInVerts, // array of all the input positions.
1205 uint32_t numInAttribs, // number of attributes per vertex.
1206 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
1207 {
1208 // compute interpolation factor
1209 simdscalar t;
1210 switch (ClippingPlane)
1211 {
1212 case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
1213 case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
1214 case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
1215 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
1216 case FRUSTUM_NEAR:
1217 // DX Znear plane is 0, GL is -w
1218 if (this->state.rastState.clipHalfZ)
1219 {
1220 t = ComputeInterpFactor(v1[2], v2[2]);
1221 }
1222 else
1223 {
1224 t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
1225 }
1226 break;
1227 case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
1228 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1229 }
1230
1231 // interpolate position and store
1232 for (uint32_t c = 0; c < 4; ++c)
1233 {
1234 simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
1235 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
1236 }
1237
1238 // interpolate attributes and store
1239 for (uint32_t a = 0; a < numInAttribs; ++a)
1240 {
1241 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
1242 for (uint32_t c = 0; c < 4; ++c)
1243 {
1244 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1245 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1246 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1247 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1248 }
1249 }
1250
1251 // interpolate clip distance if enabled
1252 if (this->state.rastState.clipDistanceMask & 0xf)
1253 {
1254 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
1255 for (uint32_t c = 0; c < 4; ++c)
1256 {
1257 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1258 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1259 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1260 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1261 }
1262 }
1263
1264 if (this->state.rastState.clipDistanceMask & 0xf0)
1265 {
1266 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
1267 for (uint32_t c = 0; c < 4; ++c)
1268 {
1269 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1270 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1271 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1272 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1273 }
1274 }
1275 }
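// Note: positions and attributes are interpolated with the same t, linearly in homogeneous
// clip space; perspective correction happens later at rasterization, after the w divide, which
// is the standard approach for homogeneous clipping.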
1276
1277 #if USE_SIMD16_FRONTEND
1278 template<SWR_CLIPCODES ClippingPlane>
1279 inline void intersect(
1280 const simd16scalar& vActiveMask,// active lanes to operate on
1281 const simd16scalari& s, // index to first edge vertex v0 in pInPts.
1282 const simd16scalari& p, // index to second edge vertex v1 in pInPts.
1283 const simd16vector& v1, // vertex 0 position
1284 const simd16vector& v2, // vertex 1 position
1285 simd16scalari& outIndex, // output index.
1286 const float *pInVerts, // array of all the input positions.
1287 uint32_t numInAttribs, // number of attributes per vertex.
1288 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
1289 {
1290 // compute interpolation factor
1291 simd16scalar t;
1292 switch (ClippingPlane)
1293 {
1294 case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[0]), _simd16_add_ps(v2[3], v2[0])); break;
1295 case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[0]), _simd16_sub_ps(v2[3], v2[0])); break;
1296 case FRUSTUM_TOP: t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[1]), _simd16_add_ps(v2[3], v2[1])); break;
1297 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[1]), _simd16_sub_ps(v2[3], v2[1])); break;
1298 case FRUSTUM_NEAR:
1299 // DX Znear plane is 0, GL is -w
1300 if (this->state.rastState.clipHalfZ)
1301 {
1302 t = ComputeInterpFactor(v1[2], v2[2]);
1303 }
1304 else
1305 {
1306 t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[2]), _simd16_add_ps(v2[3], v2[2]));
1307 }
1308 break;
1309 case FRUSTUM_FAR: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[2]), _simd16_sub_ps(v2[3], v2[2])); break;
1310 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1311 }
1312
1313 // interpolate position and store
1314 for (uint32_t c = 0; c < 4; ++c)
1315 {
1316 simd16scalar vOutPos = _simd16_fmadd_ps(_simd16_sub_ps(v2[c], v1[c]), t, v1[c]);
1317 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
1318 }
1319
1320 // interpolate attributes and store
1321 for (uint32_t a = 0; a < numInAttribs; ++a)
1322 {
1323 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
1324 for (uint32_t c = 0; c < 4; ++c)
1325 {
1326 simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1327 simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1328 simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1329 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1330 }
1331 }
1332
1333 // interpolate clip distance if enabled
1334 if (this->state.rastState.clipDistanceMask & 0xf)
1335 {
1336 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
1337 for (uint32_t c = 0; c < 4; ++c)
1338 {
1339 simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1340 simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1341 simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1342 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1343 }
1344 }
1345
1346 if (this->state.rastState.clipDistanceMask & 0xf0)
1347 {
1348 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
1349 for (uint32_t c = 0; c < 4; ++c)
1350 {
1351 simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
1352 simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
1353 simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
1354 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
1355 }
1356 }
1357 }
1358
1359 #endif
1360 template<SWR_CLIPCODES ClippingPlane>
1361 inline simdscalar inside(const simdvector& v)
1362 {
1363 switch (ClippingPlane)
1364 {
1365 case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
1366 case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
1367 case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
1368 case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
1369 case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
1370 case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
1371 default:
1372 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1373 return _simd_setzero_ps();
1374 }
1375 }
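// Scalar equivalents of the tests above (GL-style depth unless clipHalfZ):
//   left: x >= -w     right:  x <= w
//   top:  y >= -w     bottom: y <= w
//   near: z >= -w (z >= 0 with clipHalfZ)     far: z <= w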
1376
1377 #if USE_SIMD16_FRONTEND
1378 template<SWR_CLIPCODES ClippingPlane>
1379 inline simd16scalar inside(const simd16vector& v)
1380 {
1381 switch (ClippingPlane)
1382 {
1383 case FRUSTUM_LEFT: return _simd16_cmpge_ps(v[0], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
1384 case FRUSTUM_RIGHT: return _simd16_cmple_ps(v[0], v[3]);
1385 case FRUSTUM_TOP: return _simd16_cmpge_ps(v[1], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
1386 case FRUSTUM_BOTTOM: return _simd16_cmple_ps(v[1], v[3]);
1387 case FRUSTUM_NEAR: return _simd16_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd16_setzero_ps() : _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
1388 case FRUSTUM_FAR: return _simd16_cmple_ps(v[2], v[3]);
1389 default:
1390 SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1391 return _simd16_setzero_ps();
1392 }
1393 }
1394
1395 #endif
1396 template<SWR_CLIPCODES ClippingPlane>
1397 simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
1398 {
1399 simdscalari vCurIndex = _simd_setzero_si();
1400 simdscalari vOutIndex = _simd_setzero_si();
1401 simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
1402
1403 while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1404 {
1405 simdscalari s = vCurIndex;
1406 simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
1407 simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
1408 p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
1409
1410 // gather position
1411 simdvector vInPos0, vInPos1;
1412 for (uint32_t c = 0; c < 4; ++c)
1413 {
1414 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1415 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1416 }
1417
1418 // compute inside mask
1419 simdscalar s_in = inside<ClippingPlane>(vInPos0);
1420 simdscalar p_in = inside<ClippingPlane>(vInPos1);
1421
1422 // compute intersection mask (s_in != p_in)
1423 simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
1424 intersectMask = _simd_and_ps(intersectMask, vActiveMask);
1425
1426 // store s if inside
1427 s_in = _simd_and_ps(s_in, vActiveMask);
1428 if (!_simd_testz_ps(s_in, s_in))
1429 {
1430 // store position
1431 for (uint32_t c = 0; c < 4; ++c)
1432 {
1433 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1434 }
1435
1436 // store attribs
1437 for (uint32_t a = 0; a < numInAttribs; ++a)
1438 {
1439 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
1440 for (uint32_t c = 0; c < 4; ++c)
1441 {
1442 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1443 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1444 }
1445 }
1446
1447 // store clip distance if enabled
1448 if (this->state.rastState.clipDistanceMask & 0xf)
1449 {
1450 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
1451 for (uint32_t c = 0; c < 4; ++c)
1452 {
1453 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1454 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1455 }
1456 }
1457
1458 if (this->state.rastState.clipDistanceMask & 0xf0)
1459 {
1460 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
1461 for (uint32_t c = 0; c < 4; ++c)
1462 {
1463 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1464 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1465 }
1466 }
1467
1468 // increment outIndex
1469 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
1470 }
1471
1472 // compute and store intersection
1473 if (!_simd_testz_ps(intersectMask, intersectMask))
1474 {
1475 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1476
1477 // increment outIndex for active lanes
1478 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
1479 }
1480
1481 // increment loop index and update active mask
1482 vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
1483 vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
1484 }
1485
1486 return vOutIndex;
1487 }
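// Per lane, the loop above is one Sutherland-Hodgman pass: walk the polygon edges (s, p),
// emit s when s is inside the plane, and emit the edge/plane intersection when s and p
// straddle it. Illustrative scalar sketch of what each SIMD lane computes:
//
//   for (uint32_t i = 0; i < numInPts; ++i) {
//       Vertex s = in[i];
//       Vertex p = in[(i + 1) % numInPts];  // the blendv on p above wraps the last edge to vertex 0
//       if (inside(s))              { out[outIndex++] = s; }
//       if (inside(s) != inside(p)) { out[outIndex++] = intersect(s, p); }
//   }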
1488
1489 #if USE_SIMD16_FRONTEND
1490 template<SWR_CLIPCODES ClippingPlane>
1491 simd16scalari ClipTriToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
1492 {
1493 simd16scalari vCurIndex = _simd16_setzero_si();
1494 simd16scalari vOutIndex = _simd16_setzero_si();
1495 simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
1496
1497 while (!_simd16_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1498 {
1499 simd16scalari s = vCurIndex;
1500 simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));
            simd16scalari underFlowMask = _simd16_cmpgt_epi32(vNumInPts, p);
            p = _simd16_castps_si(_simd16_blendv_ps(_simd16_setzero_ps(), _simd16_castsi_ps(p), _simd16_castsi_ps(underFlowMask)));

            // gather position
            simd16vector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simd16scalar s_in = inside<ClippingPlane>(vInPos0);
            simd16scalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
            intersectMask = _simd16_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd16_and_ps(s_in, vActiveMask);
            if (!_simd16_testz_ps(s_in, s_in))
            {
                // store position
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // store clip distances 0-3 if enabled
                if (this->state.rastState.clipDistanceMask & 0xf)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // store clip distances 4-7 if enabled
                if (this->state.rastState.clipDistanceMask & 0xf0)
                {
                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd16_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
            }

            // increment loop index and update active mask
            vCurIndex = _simd16_add_epi32(vCurIndex, _simd16_set1_epi32(1));
            vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
        }

        return vOutIndex;
    }

#endif
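
    //////////////////////////////////////////////////////////////////////////
    /// @brief Clips a SIMD batch of line segments against ClippingPlane.
    ///        Unlike the triangle path this is a single pass over the one
    ///        edge (s = 0, p = 1): surviving endpoints are copied through,
    ///        and intersect() emits the interpolated vertex where a segment
    ///        crosses the plane.
    /// @return per-lane count of vertices written to pOutVerts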
    template<SWR_CLIPCODES ClippingPlane>
    simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        simdscalari vCurIndex = _simd_setzero_si();
        simdscalari vOutIndex = _simd_setzero_si();
        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));

        if (!_simd_testz_ps(vActiveMask, vActiveMask))
        {
            simdscalari s = vCurIndex;
            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
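            // no wrap-around needed here: a segment has exactly two vertices,
            // so p = s + 1 always indexes the second endpoint for active lanes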

            // gather position
            simdvector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simdscalar s_in = inside<ClippingPlane>(vInPos0);
            simdscalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
            intersectMask = _simd_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd_and_ps(s_in, vActiveMask);
            if (!_simd_testz_ps(s_in, s_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
            }

            // store p if inside
            p_in = _simd_and_ps(p_in, vActiveMask);
            if (!_simd_testz_ps(p_in, p_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
            }
        }

        return vOutIndex;
    }

#if USE_SIMD16_FRONTEND
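    //////////////////////////////////////////////////////////////////////////
    /// @brief SIMD16 variant of ClipLineToPlane (see the simd8 version above).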
    template<SWR_CLIPCODES ClippingPlane>
    simd16scalari ClipLineToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
    {
        simd16scalari vCurIndex = _simd16_setzero_si();
        simd16scalari vOutIndex = _simd16_setzero_si();
        simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));

        if (!_simd16_testz_ps(vActiveMask, vActiveMask))
        {
            simd16scalari s = vCurIndex;
            simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));

            // gather position
            simd16vector vInPos0, vInPos1;
            for (uint32_t c = 0; c < 4; ++c)
            {
                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
            }

            // compute inside mask
            simd16scalar s_in = inside<ClippingPlane>(vInPos0);
            simd16scalar p_in = inside<ClippingPlane>(vInPos1);

            // compute intersection mask (s_in != p_in)
            simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
            intersectMask = _simd16_and_ps(intersectMask, vActiveMask);

            // store s if inside
            s_in = _simd16_and_ps(s_in, vActiveMask);
            if (!_simd16_testz_ps(s_in, s_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
            }

            // compute and store intersection
            if (!_simd16_testz_ps(intersectMask, intersectMask))
            {
                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

                // increment outIndex for active lanes
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
            }

            // store p if inside
            p_in = _simd16_and_ps(p_in, vActiveMask);
            if (!_simd16_testz_ps(p_in, p_in))
            {
                for (uint32_t c = 0; c < 4; ++c)
                {
                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                }

                // store attribs
                for (uint32_t a = 0; a < numInAttribs; ++a)
                {
                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
                    for (uint32_t c = 0; c < 4; ++c)
                    {
                        simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                    }
                }

                // increment outIndex
                vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), p_in);
            }
        }

        return vOutIndex;
    }

#endif

    //////////////////////////////////////////////////////////////////////////
    /// @brief Vertical clipper. Clips a full SIMD batch of primitives at a time.
    /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
    /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
    /// @param vClipMask - mask of primitives that require clipping
    /// @param numAttribs - number of valid input attribs, including position
    /// @return per-lane count of output vertices
    simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
    {
        // temp storage
        float* pTempVerts = (float*)&tlsTempVertices[0];

        // zero out num input verts for non-active lanes
        simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
        vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
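        // lanes zeroed here fail the (vCurIndex < vNumInPts) test on entry,
        // so the per-plane clippers below are a no-op for non-clipped prims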

        // clip prims to frustum
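        // each pass reads one buffer and writes the clipped result to the
        // other, ping-ponging between pVertices and pTempVerts; six planes is
        // an even number of passes, so the final verts land back in pVertices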
        simdscalari vNumOutPts;
        if (NumVertsPerPrim == 3)
        {
            vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        }
        else
        {
            SWR_ASSERT(NumVertsPerPrim == 2);
            vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        }

        // restore num verts for non-clipped, active lanes
        simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
        vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);

        return vNumOutPts;
    }

#if USE_SIMD16_FRONTEND
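    //////////////////////////////////////////////////////////////////////////
    /// @brief SIMD16 variant of ClipPrims (see the simd8 version above).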
    simd16scalari ClipPrims(float* pVertices, const simd16scalar& vPrimMask, const simd16scalar& vClipMask, int numAttribs)
    {
        // temp storage
        float* pTempVerts = (float*)&tlsTempVertices_simd16[0];

        // zero out num input verts for non-active lanes
        simd16scalari vNumInPts = _simd16_set1_epi32(NumVertsPerPrim);
        vNumInPts = _simd16_blendv_epi32(_simd16_setzero_si(), vNumInPts, vClipMask);

        // clip prims to frustum
        simd16scalari vNumOutPts;
        if (NumVertsPerPrim == 3)
        {
            vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        }
        else
        {
            SWR_ASSERT(NumVertsPerPrim == 2);
            vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
            vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
            vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        }

        // restore num verts for non-clipped, active lanes
        simd16scalar vNonClippedMask = _simd16_andnot_ps(vClipMask, vPrimMask);
        vNumOutPts = _simd16_blendv_epi32(vNumOutPts, _simd16_set1_epi32(NumVertsPerPrim), vNonClippedMask);

        return vNumOutPts;
    }

#endif
    const uint32_t workerId{ 0 };
    DRAW_CONTEXT* pDC{ nullptr };
    const API_STATE& state;
    simdscalar clipCodes[NumVertsPerPrim];          // per-vertex clip codes for the current batch
#if USE_SIMD16_FRONTEND
    simd16scalar clipCodes_simd16[NumVertsPerPrim];
#endif
};


// pipeline stage functions
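// invoked by the frontend per topology when the clipper stage is enabled;
// each clips a SIMD batch of assembled prims and forwards the survivors
// down the pipeline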
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
#endif
