/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Definitions for clipping
*
******************************************************************************/
30 #include "common/simdintrin.h"
31 #include "core/context.h"
33 #include "rdtsc_core.h"
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices
[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD simd16vertex tlsTempVertices_simd16
[7];
// Per-plane clip codes.  Each frustum code occupies a single bit shifted out
// of the float mantissa so that a code-bearing value never compares as a
// denormal in float compares.  Guardband codes share one high bit plus four
// distinct low bits, because guardband tests compute a union (not an
// intersection) of clipcodes.
enum SWR_CLIPCODES   // NOTE(review): enum tag reconstructed — confirm against original header
{
    // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
    // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
#define CLIPCODE_SHIFT 23
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    NEGW             = (0x40 << CLIPCODE_SHIFT),

    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// Codes that actually require clipping (guardband planes, depth planes, negative w).
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
65 void ComputeClipCodes(const API_STATE
& state
, const simdvector
& vertex
, simdscalar
& clipCodes
, simdscalari viewportIndexes
)
67 clipCodes
= _simd_setzero_ps();
70 simdscalar vNegW
= _simd_mul_ps(vertex
.w
, _simd_set1_ps(-1.0f
));
73 simdscalar vRes
= _simd_cmplt_ps(vertex
.x
, vNegW
);
74 clipCodes
= _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT
)));
77 vRes
= _simd_cmplt_ps(vertex
.y
, vNegW
);
78 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP
))));
81 vRes
= _simd_cmpgt_ps(vertex
.x
, vertex
.w
);
82 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT
))));
85 vRes
= _simd_cmpgt_ps(vertex
.y
, vertex
.w
);
86 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM
))));
88 if (state
.rastState
.depthClipEnable
)
91 // DX clips depth [0..w], GL clips [-w..w]
92 if (state
.rastState
.clipHalfZ
)
94 vRes
= _simd_cmplt_ps(vertex
.z
, _simd_setzero_ps());
98 vRes
= _simd_cmplt_ps(vertex
.z
, vNegW
);
100 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR
))));
103 vRes
= _simd_cmpgt_ps(vertex
.z
, vertex
.w
);
104 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR
))));
108 vRes
= _simd_cmple_ps(vertex
.w
, _simd_setzero_ps());
109 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(NEGW
))));
112 simdscalar gbMult
= _simd_mul_ps(vNegW
, _simd_i32gather_ps(&state
.gbState
.left
[0], viewportIndexes
, 4));
113 vRes
= _simd_cmplt_ps(vertex
.x
, gbMult
);
114 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT
))));
117 gbMult
= _simd_mul_ps(vNegW
, _simd_i32gather_ps(&state
.gbState
.top
[0], viewportIndexes
, 4));
118 vRes
= _simd_cmplt_ps(vertex
.y
, gbMult
);
119 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP
))));
122 gbMult
= _simd_mul_ps(vertex
.w
, _simd_i32gather_ps(&state
.gbState
.right
[0], viewportIndexes
, 4));
123 vRes
= _simd_cmpgt_ps(vertex
.x
, gbMult
);
124 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT
))));
127 gbMult
= _simd_mul_ps(vertex
.w
, _simd_i32gather_ps(&state
.gbState
.bottom
[0], viewportIndexes
, 4));
128 vRes
= _simd_cmpgt_ps(vertex
.y
, gbMult
);
129 clipCodes
= _simd_or_ps(clipCodes
, _simd_and_ps(vRes
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM
))));
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant of ComputeClipCodes; identical plane tests using
///        the _simd16_* intrinsics (16 vertices per call).
void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari viewportIndexes)
{
    clipCodes = _simd16_setzero_ps();

    // -w
    simd16scalar vNegW = _simd16_mul_ps(vertex.w, _simd16_set1_ps(-1.0f));

    // FRUSTUM_LEFT: x < -w
    simd16scalar vRes = _simd16_cmplt_ps(vertex.x, vNegW);
    clipCodes = _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_LEFT)));

    // FRUSTUM_TOP: y < -w
    vRes = _simd16_cmplt_ps(vertex.y, vNegW);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_TOP))));

    // FRUSTUM_RIGHT: x > w
    vRes = _simd16_cmpgt_ps(vertex.x, vertex.w);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_RIGHT))));

    // FRUSTUM_BOTTOM: y > w
    vRes = _simd16_cmpgt_ps(vertex.y, vertex.w);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_BOTTOM))));

    if (state.rastState.depthClipEnable)
    {
        // FRUSTUM_NEAR
        // DX clips depth [0..w], GL clips [-w..w]
        if (state.rastState.clipHalfZ)
        {
            vRes = _simd16_cmplt_ps(vertex.z, _simd16_setzero_ps());
        }
        else
        {
            vRes = _simd16_cmplt_ps(vertex.z, vNegW);
        }
        clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_NEAR))));

        // FRUSTUM_FAR: z > w
        vRes = _simd16_cmpgt_ps(vertex.z, vertex.w);
        clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_FAR))));
    }

    // NEGW: w <= 0
    vRes = _simd16_cmple_ps(vertex.w, _simd16_setzero_ps());
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(NEGW))));

    // GUARDBAND_LEFT
    simd16scalar gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
    vRes = _simd16_cmplt_ps(vertex.x, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_LEFT))));

    // GUARDBAND_TOP
    gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
    vRes = _simd16_cmplt_ps(vertex.y, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_TOP))));

    // GUARDBAND_RIGHT
    gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
    vRes = _simd16_cmpgt_ps(vertex.x, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_RIGHT))));

    // GUARDBAND_BOTTOM
    gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
    vRes = _simd16_cmpgt_ps(vertex.y, gbMult);
    clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_BOTTOM))));
}
#endif
202 template<uint32_t NumVertsPerPrim
>
206 Clipper(uint32_t in_workerId
, DRAW_CONTEXT
* in_pDC
) :
207 workerId(in_workerId
), pDC(in_pDC
), state(GetApiState(in_pDC
))
209 static_assert(NumVertsPerPrim
>= 1 && NumVertsPerPrim
<= 3, "Invalid NumVertsPerPrim");
212 void ComputeClipCodes(simdvector vertex
[], simdscalari viewportIndexes
)
214 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
216 ::ComputeClipCodes(this->state
, vertex
[i
], this->clipCodes
[i
], viewportIndexes
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant: fills clipCodes_simd16 for each vertex of the primitive.
void ComputeClipCodes(simd16vector vertex[], simd16scalari viewportIndexes)
{
    for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    {
        ::ComputeClipCodes(this->state, vertex[i], this->clipCodes_simd16[i], viewportIndexes);
    }
}
#endif
230 simdscalar
ComputeClipCodeIntersection()
232 simdscalar result
= this->clipCodes
[0];
233 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
235 result
= _simd_and_ps(result
, this->clipCodes
[i
]);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of ComputeClipCodeIntersection (AND across vertices).
simd16scalar ComputeClipCodeIntersection_simd16()
{
    simd16scalar result = this->clipCodes_simd16[0];
    for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
    {
        result = _simd16_and_ps(result, this->clipCodes_simd16[i]);
    }
    return result;
}
#endif
252 simdscalar
ComputeClipCodeUnion()
254 simdscalar result
= this->clipCodes
[0];
255 for (uint32_t i
= 1; i
< NumVertsPerPrim
; ++i
)
257 result
= _simd_or_ps(result
, this->clipCodes
[i
]);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of ComputeClipCodeUnion (OR across vertices).
simd16scalar ComputeClipCodeUnion_simd16()
{
    simd16scalar result = this->clipCodes_simd16[0];
    for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
    {
        result = _simd16_or_ps(result, this->clipCodes_simd16[i]);
    }
    return result;
}
#endif
274 int ComputeNegWMask()
276 simdscalar clipCodeUnion
= ComputeClipCodeUnion();
277 clipCodeUnion
= _simd_and_ps(clipCodeUnion
, _simd_castsi_ps(_simd_set1_epi32(NEGW
)));
278 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion
, _simd_setzero_ps()));
281 int ComputeClipMask()
283 simdscalar clipUnion
= ComputeClipCodeUnion();
284 clipUnion
= _simd_and_ps(clipUnion
, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK
)));
285 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion
, _simd_setzero_ps()));
#if USE_SIMD16_FRONTEND
// SIMD16 variant of ComputeClipMask.
int ComputeClipMask_simd16()
{
    simd16scalar clipUnion = ComputeClipCodeUnion_simd16();
    clipUnion = _simd16_and_ps(clipUnion, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK)));
    return _simd16_movemask_ps(_simd16_cmpneq_ps(clipUnion, _simd16_setzero_ps()));
}
#endif
297 // clipper is responsible for culling any prims with NAN coordinates
298 int ComputeNaNMask(simdvector prim
[])
300 simdscalar vNanMask
= _simd_setzero_ps();
301 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
303 simdscalar vNan01
= _simd_cmp_ps(prim
[e
].v
[0], prim
[e
].v
[1], _CMP_UNORD_Q
);
304 vNanMask
= _simd_or_ps(vNanMask
, vNan01
);
305 simdscalar vNan23
= _simd_cmp_ps(prim
[e
].v
[2], prim
[e
].v
[3], _CMP_UNORD_Q
);
306 vNanMask
= _simd_or_ps(vNanMask
, vNan23
);
309 return _simd_movemask_ps(vNanMask
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant: lane mask of primitives with any NaN position component.
int ComputeNaNMask(simd16vector prim[])
{
    simd16scalar vNanMask = _simd16_setzero_ps();
    for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
    {
        simd16scalar vNan01 = _simd16_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
        vNanMask = _simd16_or_ps(vNanMask, vNan01);
        simd16scalar vNan23 = _simd16_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
        vNanMask = _simd16_or_ps(vNanMask, vNan23);
    }

    return _simd16_movemask_ps(vNanMask);
}
#endif
328 int ComputeUserClipCullMask(PA_STATE
& pa
, simdvector prim
[])
330 uint8_t cullMask
= this->state
.rastState
.cullDistanceMask
;
331 simdscalar vClipCullMask
= _simd_setzero_ps();
334 simdvector vClipCullDistLo
[3];
335 simdvector vClipCullDistHi
[3];
337 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, vClipCullDistLo
);
338 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, vClipCullDistHi
);
339 while (_BitScanForward(&index
, cullMask
))
341 cullMask
&= ~(1 << index
);
342 uint32_t slot
= index
>> 2;
343 uint32_t component
= index
& 0x3;
345 simdscalar vCullMaskElem
= _simd_set1_ps(-1.0f
);
346 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
348 simdscalar vCullComp
;
351 vCullComp
= vClipCullDistLo
[e
][component
];
355 vCullComp
= vClipCullDistHi
[e
][component
];
358 // cull if cull distance < 0 || NAN
359 simdscalar vCull
= _simd_cmp_ps(_mm256_setzero_ps(), vCullComp
, _CMP_NLE_UQ
);
360 vCullMaskElem
= _simd_and_ps(vCullMaskElem
, vCull
);
362 vClipCullMask
= _simd_or_ps(vClipCullMask
, vCullMaskElem
);
365 // clipper should also discard any primitive with NAN clip distance
366 uint8_t clipMask
= this->state
.rastState
.clipDistanceMask
;
367 while (_BitScanForward(&index
, clipMask
))
369 clipMask
&= ~(1 << index
);
370 uint32_t slot
= index
>> 2;
371 uint32_t component
= index
& 0x3;
373 for (uint32_t e
= 0; e
< NumVertsPerPrim
; ++e
)
375 simdscalar vClipComp
;
378 vClipComp
= vClipCullDistLo
[e
][component
];
382 vClipComp
= vClipCullDistHi
[e
][component
];
385 simdscalar vClip
= _simd_cmp_ps(vClipComp
, vClipComp
, _CMP_UNORD_Q
);
386 vClipCullMask
= _simd_or_ps(vClipCullMask
, vClip
);
390 return _simd_movemask_ps(vClipCullMask
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of ComputeUserClipCullMask (same cull/NaN-discard logic).
int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
{
    uint8_t cullMask = this->state.rastState.cullDistanceMask;
    simd16scalar vClipCullMask = _simd16_setzero_ps();
    DWORD index;   // NOTE(review): declaration reconstructed — _BitScanForward output

    simd16vector vClipCullDistLo[3];
    simd16vector vClipCullDistHi[3];

    pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
    pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);

    while (_BitScanForward(&index, cullMask))
    {
        cullMask &= ~(1 << index);
        uint32_t slot = index >> 2;        // 0 => LO vector, 1 => HI vector
        uint32_t component = index & 0x3;  // component within the vector

        simd16scalar vCullMaskElem = _simd16_set1_ps(-1.0f);
        for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
        {
            simd16scalar vCullComp;
            if (slot == 0)
            {
                vCullComp = vClipCullDistLo[e][component];
            }
            else
            {
                vCullComp = vClipCullDistHi[e][component];
            }

            // cull if cull distance < 0 || NAN
            simd16scalar vCull = _simd16_cmp_ps(_simd16_setzero_ps(), vCullComp, _CMP_NLE_UQ);
            vCullMaskElem = _simd16_and_ps(vCullMaskElem, vCull);
        }
        vClipCullMask = _simd16_or_ps(vClipCullMask, vCullMaskElem);
    }

    // clipper should also discard any primitive with NAN clip distance
    uint8_t clipMask = this->state.rastState.clipDistanceMask;
    while (_BitScanForward(&index, clipMask))
    {
        clipMask &= ~(1 << index);
        uint32_t slot = index >> 2;
        uint32_t component = index & 0x3;

        for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
        {
            simd16scalar vClipComp;
            if (slot == 0)
            {
                vClipComp = vClipCullDistLo[e][component];
            }
            else
            {
                vClipComp = vClipCullDistHi[e][component];
            }

            // unordered self-compare is true only for NaN
            simd16scalar vClip = _simd16_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
            vClipCullMask = _simd16_or_ps(vClipCullMask, vClip);
        }
    }

    return _simd16_movemask_ps(vClipCullMask);
}
#endif
461 // clip SIMD primitives
462 void ClipSimd(const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, PA_STATE
& pa
, const simdscalari
& vPrimId
)
464 // input/output vertex store for clipper
465 simdvertex vertices
[7]; // maximum 7 verts generated per triangle
467 LONG constantInterpMask
= this->state
.backendState
.constantInterpolationMask
;
468 uint32_t provokingVertex
= 0;
469 if(pa
.binTopology
== TOP_TRIANGLE_FAN
)
471 provokingVertex
= this->state
.frontendState
.provokingVertex
.triFan
;
473 ///@todo: line topology for wireframe?
476 simdvector tmpVector
[NumVertsPerPrim
];
477 pa
.Assemble(VERTEX_POSITION_SLOT
, tmpVector
);
478 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
480 vertices
[i
].attrib
[VERTEX_POSITION_SLOT
] = tmpVector
[i
];
484 const SWR_BACKEND_STATE
& backendState
= this->state
.backendState
;
486 int32_t maxSlot
= -1;
487 for (uint32_t slot
= 0; slot
< backendState
.numAttributes
; ++slot
)
489 // Compute absolute attrib slot in vertex array
490 uint32_t mapSlot
= backendState
.swizzleEnable
? backendState
.swizzleMap
[slot
].sourceAttrib
: slot
;
491 maxSlot
= std::max
<int32_t>(maxSlot
, mapSlot
);
492 uint32_t inputSlot
= backendState
.vertexAttribOffset
+ mapSlot
;
494 pa
.Assemble(inputSlot
, tmpVector
);
496 // if constant interpolation enabled for this attribute, assign the provoking
497 // vertex values to all edges
498 if (_bittest(&constantInterpMask
, slot
))
500 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
502 vertices
[i
].attrib
[inputSlot
] = tmpVector
[provokingVertex
];
507 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
509 vertices
[i
].attrib
[inputSlot
] = tmpVector
[i
];
514 // assemble user clip distances if enabled
515 if (this->state
.rastState
.clipDistanceMask
& 0xf)
517 pa
.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT
, tmpVector
);
518 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
520 vertices
[i
].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
] = tmpVector
[i
];
524 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
526 pa
.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT
, tmpVector
);
527 for (uint32_t i
= 0; i
< NumVertsPerPrim
; ++i
)
529 vertices
[i
].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
] = tmpVector
[i
];
533 uint32_t numAttribs
= maxSlot
+ 1;
535 simdscalari vNumClippedVerts
= ClipPrims((float*)&vertices
[0], vPrimMask
, vClipMask
, numAttribs
);
537 // set up new PA for binning clipped primitives
538 PFN_PROCESS_PRIMS pfnBinFunc
= nullptr;
539 PRIMITIVE_TOPOLOGY clipTopology
= TOP_UNKNOWN
;
540 if (NumVertsPerPrim
== 3)
542 pfnBinFunc
= GetBinTrianglesFunc((pa
.pDC
->pState
->state
.rastState
.conservativeRast
> 0));
543 clipTopology
= TOP_TRIANGLE_FAN
;
545 // so that the binner knows to bloat wide points later
546 if (pa
.binTopology
== TOP_POINT_LIST
)
547 clipTopology
= TOP_POINT_LIST
;
550 else if (NumVertsPerPrim
== 2)
552 pfnBinFunc
= BinLines
;
553 clipTopology
= TOP_LINE_LIST
;
557 SWR_ASSERT(0 && "Unexpected points in clipper.");
560 uint32_t* pVertexCount
= (uint32_t*)&vNumClippedVerts
;
561 uint32_t* pPrimitiveId
= (uint32_t*)&vPrimId
;
563 const simdscalari vOffsets
= _mm256_set_epi32(
564 0 * sizeof(simdvertex
), // unused lane
565 6 * sizeof(simdvertex
),
566 5 * sizeof(simdvertex
),
567 4 * sizeof(simdvertex
),
568 3 * sizeof(simdvertex
),
569 2 * sizeof(simdvertex
),
570 1 * sizeof(simdvertex
),
571 0 * sizeof(simdvertex
));
573 // only need to gather 7 verts
574 // @todo dynamic mask based on actual # of verts generated per lane
575 const simdscalar vMask
= _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
577 uint32_t numClippedPrims
= 0;
578 #if USE_SIMD16_FRONTEND
579 const uint32_t numPrims
= pa
.NumPrims();
580 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
582 SWR_ASSERT(numPrims
<= numPrims_lo
);
584 for (uint32_t inputPrim
= 0; inputPrim
< numPrims_lo
; ++inputPrim
)
586 for (uint32_t inputPrim
= 0; inputPrim
< pa
.NumPrims(); ++inputPrim
)
589 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
590 if (numEmittedVerts
< NumVertsPerPrim
)
594 SWR_ASSERT(numEmittedVerts
<= 7, "Unexpected vertex count from clipper.");
596 uint32_t numEmittedPrims
= GetNumPrims(clipTopology
, numEmittedVerts
);
597 numClippedPrims
+= numEmittedPrims
;
599 // tranpose clipper output so that each lane's vertices are in SIMD order
600 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
602 #if USE_SIMD16_FRONTEND
603 simd16vertex transposedPrims
[2];
605 simdvertex transposedPrims
[2];
609 uint8_t* pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_POSITION_SLOT
]) + sizeof(float) * inputPrim
;
611 #if USE_SIMD16_FRONTEND
612 // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
613 static const float *dummy
= reinterpret_cast<const float *>(pBase
);
616 for (uint32_t c
= 0; c
< 4; ++c
)
618 #if USE_SIMD16_FRONTEND
619 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
620 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
622 transposedPrims
[0].attrib
[VERTEX_POSITION_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
624 pBase
+= sizeof(simdscalar
);
628 pBase
= (uint8_t*)(&vertices
[0].attrib
[backendState
.vertexAttribOffset
]) + sizeof(float) * inputPrim
;
629 for (uint32_t attrib
= 0; attrib
< numAttribs
; ++attrib
)
631 uint32_t attribSlot
= backendState
.vertexAttribOffset
+ attrib
;
632 for (uint32_t c
= 0; c
< 4; ++c
)
634 #if USE_SIMD16_FRONTEND
635 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
636 transposedPrims
[0].attrib
[attribSlot
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
638 transposedPrims
[0].attrib
[attribSlot
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
640 pBase
+= sizeof(simdscalar
);
644 // transpose user clip distances if enabled
645 if (this->state
.rastState
.clipDistanceMask
& 0xf)
647 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
]) + sizeof(float) * inputPrim
;
648 for (uint32_t c
= 0; c
< 4; ++c
)
650 #if USE_SIMD16_FRONTEND
651 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
652 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
654 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_LO_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
656 pBase
+= sizeof(simdscalar
);
660 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
662 pBase
= (uint8_t*)(&vertices
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
]) + sizeof(float) * inputPrim
;
663 for (uint32_t c
= 0; c
< 4; ++c
)
665 #if USE_SIMD16_FRONTEND
666 simdscalar temp
= _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase
, vOffsets
, vMask
, 1);
667 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
][c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
669 transposedPrims
[0].attrib
[VERTEX_CLIPCULL_DIST_HI_SLOT
][c
] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase
, vOffsets
, vMask
, 1);
671 pBase
+= sizeof(simdscalar
);
675 PA_STATE_OPT
clipPa(this->pDC
, numEmittedPrims
, (uint8_t*)&transposedPrims
[0], numEmittedVerts
, SWR_VTX_NUM_SLOTS
, true, clipTopology
);
677 while (clipPa
.GetNextStreamOutput())
681 #if USE_SIMD16_FRONTEND
682 simd16vector attrib_simd16
[NumVertsPerPrim
];
683 bool assemble
= clipPa
.Assemble_simd16(VERTEX_POSITION_SLOT
, attrib_simd16
);
687 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
689 simdvector attrib
[NumVertsPerPrim
];
690 for (uint32_t i
= 0; i
< NumVertsPerPrim
; i
+= 1)
692 for (uint32_t j
= 0; j
< 4; j
+= 1)
694 attrib
[i
][j
] = _simd16_extract_ps(attrib_simd16
[i
][j
], 0);
698 clipPa
.useAlternateOffset
= false;
699 pfnBinFunc(this->pDC
, clipPa
, this->workerId
, attrib
, primMaskMap
[numEmittedPrims
], _simd_set1_epi32(pPrimitiveId
[inputPrim
]));
702 simdvector attrib
[NumVertsPerPrim
];
703 bool assemble
= clipPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
706 static const uint32_t primMaskMap
[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
707 pfnBinFunc(this->pDC
, clipPa
, this->workerId
, attrib
, primMaskMap
[numEmittedPrims
], _simd_set1_epi32(pPrimitiveId
[inputPrim
]));
710 } while (clipPa
.NextPrim());
714 // update global pipeline stat
715 UPDATE_STAT_FE(CPrimitives
, numClippedPrims
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of ClipSimd: same gather/clip/transpose/bin pipeline but
// operating on simd16 vertex stores and the simd16 binner entry points.
void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId)
{
    // input/output vertex store for clipper
    simd16vertex vertices[7]; // maximum 7 verts generated per triangle

    LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
    uint32_t provokingVertex = 0;
    if (pa.binTopology == TOP_TRIANGLE_FAN)
    {
        provokingVertex = this->state.frontendState.provokingVertex.triFan;
    }
    ///@todo: line topology for wireframe?

    // assemble position
    simd16vector tmpVector[NumVertsPerPrim];
    pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector);
    for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
    {
        vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
    }

    // assemble attributes
    const SWR_BACKEND_STATE& backendState = this->state.backendState;

    int32_t maxSlot = -1;
    for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
    {
        // Compute absolute attrib slot in vertex array
        uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
        maxSlot = std::max<int32_t>(maxSlot, mapSlot);
        uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;

        pa.Assemble_simd16(inputSlot, tmpVector);

        // if constant interpolation enabled for this attribute, assign the provoking
        // vertex values to all edges
        if (_bittest(&constantInterpMask, slot))
        {
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
            }
        }
        else
        {
            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
            {
                vertices[i].attrib[inputSlot] = tmpVector[i];
            }
        }
    }

    // assemble user clip distances if enabled
    if (this->state.rastState.clipDistanceMask & 0xf)
    {
        pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
        }
    }

    if (this->state.rastState.clipDistanceMask & 0xf0)
    {
        pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
        {
            vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
        }
    }

    uint32_t numAttribs = maxSlot + 1;

    // run the clipper; returns the number of emitted vertices per lane
    simd16scalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);

    // set up new PA for binning clipped primitives
    PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc = nullptr;
    PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
    if (NumVertsPerPrim == 3)
    {
        pfnBinFunc = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
        clipTopology = TOP_TRIANGLE_FAN;

        // so that the binner knows to bloat wide points later
        if (pa.binTopology == TOP_POINT_LIST)
        {
            clipTopology = TOP_POINT_LIST;
        }
    }
    else if (NumVertsPerPrim == 2)
    {
        pfnBinFunc = BinLines_simd16;
        clipTopology = TOP_LINE_LIST;
    }
    else
    {
        SWR_ASSERT(0 && "Unexpected points in clipper.");
    }

    uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
    uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;

    // byte offsets of the 7 emitted vertices within the simd16 vertex store
    const simdscalari vOffsets = _simd_set_epi32(
        0 * sizeof(simd16vertex), // unused lane
        6 * sizeof(simd16vertex),
        5 * sizeof(simd16vertex),
        4 * sizeof(simd16vertex),
        3 * sizeof(simd16vertex),
        2 * sizeof(simd16vertex),
        1 * sizeof(simd16vertex),
        0 * sizeof(simd16vertex));

    // only need to gather 7 verts
    // @todo dynamic mask based on actual # of verts generated per lane
    const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);

    uint32_t numClippedPrims = 0;

    // tranpose clipper output so that each lane's vertices are in SIMD order
    // set aside space for 2 vertices, as the PA will try to read up to 16 verts
#if defined(_DEBUG)
    // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
    simd16vertex *transposedPrims = reinterpret_cast<simd16vertex *>(malloc(sizeof(simd16vertex) * 2));
#else
    simd16vertex transposedPrims[2];
#endif

    for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
    {
        uint32_t numEmittedVerts = pVertexCount[inputPrim];
        if (numEmittedVerts < NumVertsPerPrim)
        {
            // fully clipped away — nothing to bin for this lane
            continue;
        }
        SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");

        uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
        numClippedPrims += numEmittedPrims;

        // transpose position
        uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;

        // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
        static const float *dummy = reinterpret_cast<const float *>(pBase);

        for (uint32_t c = 0; c < 4; ++c)
        {
            simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
            transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
            pBase += sizeof(simd16scalar);
        }

        // transpose attributes
        pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
        for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
        {
            uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                pBase += sizeof(simd16scalar);
            }
        }

        // transpose user clip distances if enabled
        if (this->state.rastState.clipDistanceMask & 0xf)
        {
            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                pBase += sizeof(simd16scalar);
            }
        }

        if (this->state.rastState.clipDistanceMask & 0xf0)
        {
            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
            for (uint32_t c = 0; c < 4; ++c)
            {
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
                transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
                pBase += sizeof(simd16scalar);
            }
        }

        PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);

        while (clipPa.GetNextStreamOutput())
        {
            do
            {
                simd16vector attrib[NumVertsPerPrim];
                bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib);

                if (assemble)
                {
                    static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff };

                    clipPa.useAlternateOffset = false;
                    pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]));
                }
            } while (clipPa.NextPrim());
        }
    }

#if defined(_DEBUG)
    free(transposedPrims);
#endif

    // update global pipeline stat
    UPDATE_STAT_FE(CPrimitives, numClippedPrims);
}
#endif
945 // execute the clipper stage
946 void ExecuteStage(PA_STATE
& pa
, simdvector prim
[], uint32_t primMask
, simdscalari primId
)
948 SWR_ASSERT(this->pDC
!= nullptr);
949 SWR_CONTEXT
* pContext
= this->pDC
->pContext
;
950 const API_STATE
& apiState
= this->pDC
->pState
->state
;
952 // set up binner based on PA state
953 PFN_PROCESS_PRIMS pfnBinner
;
954 switch (pa
.binTopology
)
957 pfnBinner
= BinPoints
;
962 case TOP_LINE_LIST_ADJ
:
963 case TOP_LISTSTRIP_ADJ
:
964 pfnBinner
= BinLines
;
967 pfnBinner
= GetBinTrianglesFunc((apiState
.rastState
.conservativeRast
> 0));
971 // update clipper invocations pipeline stat
972 uint32_t numInvoc
= _mm_popcnt_u32(primMask
);
973 UPDATE_STAT_FE(CInvocations
, numInvoc
);
975 // Read back viewport index if required
976 simdscalari viewportIdx
= _simd_set1_epi32(0);
977 if (state
.backendState
.readViewportArrayIndex
)
979 simdvector vpiAttrib
[NumVertsPerPrim
];
980 pa
.Assemble(VERTEX_SGV_SLOT
, vpiAttrib
);
981 simdscalari vpai
= _simd_castps_si(vpiAttrib
[0][VERTEX_SGV_VAI_COMP
]);
983 // OOB indices => forced to zero.
984 simdscalari vNumViewports
= _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
985 simdscalari vClearMask
= _simd_cmplt_epi32(vpai
, vNumViewports
);
986 viewportIdx
= _simd_and_si(vClearMask
, vpai
);
989 ComputeClipCodes(prim
, viewportIdx
);
991 // cull prims with NAN coords
992 primMask
&= ~ComputeNaNMask(prim
);
994 // user cull distance cull
995 if (this->state
.rastState
.cullDistanceMask
)
997 primMask
&= ~ComputeUserClipCullMask(pa
, prim
);
1000 // cull prims outside view frustum
1001 simdscalar clipIntersection
= ComputeClipCodeIntersection();
1002 int validMask
= primMask
& _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection
, _simd_setzero_ps()));
1004 // skip clipping for points
1005 uint32_t clipMask
= 0;
1006 if (NumVertsPerPrim
!= 1)
1008 clipMask
= primMask
& ComputeClipMask();
1013 AR_BEGIN(FEGuardbandClip
, pa
.pDC
->drawId
);
1014 // we have to clip tris, execute the clipper, which will also
1016 ClipSimd(vMask(primMask
), vMask(clipMask
), pa
, primId
);
1017 AR_END(FEGuardbandClip
, 1);
1021 // update CPrimitives pipeline state
1022 UPDATE_STAT_FE(CPrimitives
, _mm_popcnt_u32(validMask
));
1024 // forward valid prims directly to binner
1025 pfnBinner(this->pDC
, pa
, this->workerId
, prim
, validMask
, primId
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of the clipper stage: identical flow to the simd8 version,
// operating on 16-wide primitive batches.
void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId)
{
    SWR_ASSERT(pa.pDC != nullptr);
    SWR_CONTEXT* pContext = pa.pDC->pContext;

    // Choose the binning function that matches the primitive topology.
    PFN_PROCESS_PRIMS_SIMD16 pfnBinner;
    switch (pa.binTopology)
    {
    case TOP_POINT_LIST:
        pfnBinner = BinPoints_simd16;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LOOP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LISTSTRIP_ADJ:
        pfnBinner = BinLines_simd16;
        break;
    default:
        // Everything else rasterizes as triangles.
        pfnBinner = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
        break;
    };

    // update clipper invocations pipeline stat
    uint32_t numInvoc = _mm_popcnt_u32(primMask);
    UPDATE_STAT_FE(CInvocations, numInvoc);

    // Read back viewport index if required
    simd16scalari viewportIdx = _simd16_set1_epi32(0);
    if (state.backendState.readViewportArrayIndex)
    {
        simd16vector vpiAttrib[NumVertsPerPrim];
        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);

        // OOB indices => forced to zero.
        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
        viewportIdx = _simd16_and_si(vClearMask, vpai);
    }

    ComputeClipCodes(prim, viewportIdx);

    // cull prims with NAN coords
    primMask &= ~ComputeNaNMask(prim);

    // user cull distance cull
    if (this->state.rastState.cullDistanceMask)
    {
        primMask &= ~ComputeUserClipCullMask(pa, prim);
    }

    // Cull prims fully outside the view frustum: a lane survives only if the
    // intersection of its vertices' clip codes is zero.
    simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16();
    int validMask = primMask & _simd16_movemask_ps(_simd16_cmpeq_ps(clipIntersection, _simd16_setzero_ps()));

    // skip clipping for points
    uint32_t clipMask = 0;
    if (NumVertsPerPrim != 1)
    {
        clipMask = primMask & ComputeClipMask_simd16();
    }

    if (clipMask)
    {
        AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
        // We have prims crossing the guardband; run the clipper, which also
        // forwards the surviving prims to the binner.
        ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
        AR_END(FEGuardbandClip, 1);
    }
    else if (validMask)
    {
        // update CPrimitives pipeline state
        UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));

        // forward valid prims directly to binner
        pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
    }
}
#endif
1113 inline simdscalar
ComputeInterpFactor(simdscalar boundaryCoord0
, simdscalar boundaryCoord1
)
1115 return _simd_div_ps(boundaryCoord0
, _simd_sub_ps(boundaryCoord0
, boundaryCoord1
));
#if USE_SIMD16_FRONTEND
// SIMD16 variant: t = d0 / (d0 - d1) per lane.
inline simd16scalar ComputeInterpFactor(simd16scalar boundaryCoord0, simd16scalar boundaryCoord1)
{
    return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1));
}
#endif
1125 inline simdscalari
ComputeOffsets(uint32_t attrib
, simdscalari vIndices
, uint32_t component
)
1127 const uint32_t simdVertexStride
= sizeof(simdvertex
);
1128 const uint32_t componentStride
= sizeof(simdscalar
);
1129 const uint32_t attribStride
= sizeof(simdvector
);
1130 const __m256i vElemOffset
= _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
1131 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
1133 // step to the simdvertex
1134 simdscalari vOffsets
= _simd_mullo_epi32(vIndices
, _simd_set1_epi32(simdVertexStride
));
1136 // step to the attribute and component
1137 vOffsets
= _simd_add_epi32(vOffsets
, _simd_set1_epi32(attribStride
* attrib
+ componentStride
* component
));
1140 vOffsets
= _simd_add_epi32(vOffsets
, vElemOffset
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant: per-lane byte offsets into the SOA vertex buffer for a
// given attribute slot and component.
inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari vIndices, uint32_t component)
{
    const uint32_t simdVertexStride = sizeof(simd16vertex);
    const uint32_t componentStride = sizeof(simd16scalar);
    const uint32_t attribStride = sizeof(simd16vector);
    const simd16scalari vElemOffset = _simd16_set_epi32(
        15 * sizeof(float), 14 * sizeof(float), 13 * sizeof(float), 12 * sizeof(float),
        11 * sizeof(float), 10 * sizeof(float),  9 * sizeof(float),  8 * sizeof(float),
         7 * sizeof(float),  6 * sizeof(float),  5 * sizeof(float),  4 * sizeof(float),
         3 * sizeof(float),  2 * sizeof(float),  1 * sizeof(float),  0 * sizeof(float));

    // step to the simdvertex
    simd16scalari vOffsets = _simd16_mullo_epi32(vIndices, _simd16_set1_epi32(simdVertexStride));

    // step to the attribute and component
    vOffsets = _simd16_add_epi32(vOffsets, _simd16_set1_epi32(attribStride * attrib + componentStride * component));

    // step to the lane within the component
    vOffsets = _simd16_add_epi32(vOffsets, vElemOffset);

    return vOffsets;
}
#endif
1170 // gathers a single component for a given attribute for each SIMD lane
1171 inline simdscalar
GatherComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
)
1173 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
1174 simdscalar vSrc
= _mm256_undefined_ps();
1175 return _simd_mask_i32gather_ps(vSrc
, pBuffer
, vOffsets
, vMask
, 1);
#if USE_SIMD16_FRONTEND
// SIMD16 variant: gathers one component of an attribute per active lane;
// masked-off lanes keep the zeroed src value.
inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component)
{
    simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
    simd16scalar vSrc = _simd16_setzero_ps();
    return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
}
#endif
1187 inline void ScatterComponent(const float* pBuffer
, uint32_t attrib
, simdscalar vMask
, simdscalari vIndices
, uint32_t component
, simdscalar vSrc
)
1189 simdscalari vOffsets
= ComputeOffsets(attrib
, vIndices
, component
);
1191 uint32_t* pOffsets
= (uint32_t*)&vOffsets
;
1192 float* pSrc
= (float*)&vSrc
;
1193 uint32_t mask
= _simd_movemask_ps(vMask
);
1195 while (_BitScanForward(&lane
, mask
))
1197 mask
&= ~(1 << lane
);
1198 uint8_t* pBuf
= (uint8_t*)pBuffer
+ pOffsets
[lane
];
1199 *(float*)pBuf
= pSrc
[lane
];
#if USE_SIMD16_FRONTEND
// SIMD16 variant: scatters one component per set mask bit via scalar stores.
// NOTE(review): writes through the const-qualified pBuffer after a cast,
// mirroring the simd8 variant — kept to preserve the interface.
inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component, simd16scalar vSrc)
{
    simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);

    uint32_t* pOffsets = (uint32_t*)&vOffsets;
    float* pSrc = (float*)&vSrc;
    uint32_t mask = _simd16_movemask_ps(vMask);
    DWORD lane;
    while (_BitScanForward(&lane, mask))
    {
        mask &= ~(1 << lane);
        uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
        *(float*)pBuf = pSrc[lane];
    }
}
#endif
1221 template<SWR_CLIPCODES ClippingPlane
>
1222 inline void intersect(
1223 const simdscalar
& vActiveMask
, // active lanes to operate on
1224 const simdscalari
& s
, // index to first edge vertex v0 in pInPts.
1225 const simdscalari
& p
, // index to second edge vertex v1 in pInPts.
1226 const simdvector
& v1
, // vertex 0 position
1227 const simdvector
& v2
, // vertex 1 position
1228 simdscalari
& outIndex
, // output index.
1229 const float *pInVerts
, // array of all the input positions.
1230 uint32_t numInAttribs
, // number of attributes per vertex.
1231 float *pOutVerts
) // array of output positions. We'll write our new intersection point at i*4.
1233 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1235 // compute interpolation factor
1237 switch (ClippingPlane
)
1239 case FRUSTUM_LEFT
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[0]), _simd_add_ps(v2
[3], v2
[0])); break;
1240 case FRUSTUM_RIGHT
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[0]), _simd_sub_ps(v2
[3], v2
[0])); break;
1241 case FRUSTUM_TOP
: t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[1]), _simd_add_ps(v2
[3], v2
[1])); break;
1242 case FRUSTUM_BOTTOM
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[1]), _simd_sub_ps(v2
[3], v2
[1])); break;
1244 // DX Znear plane is 0, GL is -w
1245 if (this->state
.rastState
.clipHalfZ
)
1247 t
= ComputeInterpFactor(v1
[2], v2
[2]);
1251 t
= ComputeInterpFactor(_simd_add_ps(v1
[3], v1
[2]), _simd_add_ps(v2
[3], v2
[2]));
1254 case FRUSTUM_FAR
: t
= ComputeInterpFactor(_simd_sub_ps(v1
[3], v1
[2]), _simd_sub_ps(v2
[3], v2
[2])); break;
1255 default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
1258 // interpolate position and store
1259 for (uint32_t c
= 0; c
< 4; ++c
)
1261 simdscalar vOutPos
= _simd_fmadd_ps(_simd_sub_ps(v2
[c
], v1
[c
]), t
, v1
[c
]);
1262 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, outIndex
, c
, vOutPos
);
1265 // interpolate attributes and store
1266 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1268 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1269 for (uint32_t c
= 0; c
< 4; ++c
)
1271 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
1272 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
1273 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
1274 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
1278 // interpolate clip distance if enabled
1279 if (this->state
.rastState
.clipDistanceMask
& 0xf)
1281 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_LO_SLOT
;
1282 for (uint32_t c
= 0; c
< 4; ++c
)
1284 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
1285 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
1286 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
1287 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
1291 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
1293 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_HI_SLOT
;
1294 for (uint32_t c
= 0; c
< 4; ++c
)
1296 simdscalar vAttrib0
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, s
, c
);
1297 simdscalar vAttrib1
= GatherComponent(pInVerts
, attribSlot
, vActiveMask
, p
, c
);
1298 simdscalar vOutAttrib
= _simd_fmadd_ps(_simd_sub_ps(vAttrib1
, vAttrib0
), t
, vAttrib0
);
1299 ScatterComponent(pOutVerts
, attribSlot
, vActiveMask
, outIndex
, c
, vOutAttrib
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of the edge/plane intersection: interpolates position,
// attributes, and enabled clip distances for each active lane.
template<SWR_CLIPCODES ClippingPlane>
inline void intersect(
    const simd16scalar& vActiveMask, // active lanes to operate on
    const simd16scalari& s,          // index to first edge vertex v0 in pInPts.
    const simd16scalari& p,          // index to second edge vertex v1 in pInPts.
    const simd16vector& v1,          // vertex 0 position
    const simd16vector& v2,          // vertex 1 position
    simd16scalari& outIndex,         // output index.
    const float *pInVerts,           // array of all the input positions.
    uint32_t numInAttribs,           // number of attributes per vertex.
    float *pOutVerts)                // array of output positions. We'll write our new intersection point at i*4.
{
    uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

    // Compute the interpolation factor from the signed boundary distances.
    simd16scalar t;
    switch (ClippingPlane)
    {
    case FRUSTUM_LEFT:   t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[0]), _simd16_add_ps(v2[3], v2[0])); break;
    case FRUSTUM_RIGHT:  t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[0]), _simd16_sub_ps(v2[3], v2[0])); break;
    case FRUSTUM_TOP:    t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[1]), _simd16_add_ps(v2[3], v2[1])); break;
    case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[1]), _simd16_sub_ps(v2[3], v2[1])); break;
    case FRUSTUM_NEAR:
        // DX Znear plane is 0, GL is -w
        if (this->state.rastState.clipHalfZ)
        {
            t = ComputeInterpFactor(v1[2], v2[2]);
        }
        else
        {
            t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[2]), _simd16_add_ps(v2[3], v2[2]));
        }
        break;
    case FRUSTUM_FAR:    t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[2]), _simd16_sub_ps(v2[3], v2[2])); break;
    default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
    };

    // interpolate position and store
    for (uint32_t c = 0; c < 4; ++c)
    {
        simd16scalar vOutPos = _simd16_fmadd_ps(_simd16_sub_ps(v2[c], v1[c]), t, v1[c]);
        ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
    }

    // interpolate attributes and store
    for (uint32_t a = 0; a < numInAttribs; ++a)
    {
        uint32_t attribSlot = vertexAttribOffset + a;
        for (uint32_t c = 0; c < 4; ++c)
        {
            simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
            simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
            simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
            ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
        }
    }

    // interpolate clip distances (low 4) if enabled
    if (this->state.rastState.clipDistanceMask & 0xf)
    {
        uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
        for (uint32_t c = 0; c < 4; ++c)
        {
            simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
            simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
            simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
            ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
        }
    }

    // interpolate clip distances (high 4) if enabled
    if (this->state.rastState.clipDistanceMask & 0xf0)
    {
        uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
        for (uint32_t c = 0; c < 4; ++c)
        {
            simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
            simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
            simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
            ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
        }
    }
}
#endif
1389 template<SWR_CLIPCODES ClippingPlane
>
1390 inline simdscalar
inside(const simdvector
& v
)
1392 switch (ClippingPlane
)
1394 case FRUSTUM_LEFT
: return _simd_cmpge_ps(v
[0], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
1395 case FRUSTUM_RIGHT
: return _simd_cmple_ps(v
[0], v
[3]);
1396 case FRUSTUM_TOP
: return _simd_cmpge_ps(v
[1], _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
1397 case FRUSTUM_BOTTOM
: return _simd_cmple_ps(v
[1], v
[3]);
1398 case FRUSTUM_NEAR
: return _simd_cmpge_ps(v
[2], this->state
.rastState
.clipHalfZ
? _simd_setzero_ps() : _simd_mul_ps(v
[3], _simd_set1_ps(-1.0f
)));
1399 case FRUSTUM_FAR
: return _simd_cmple_ps(v
[2], v
[3]);
1401 SWR_INVALID("invalid clipping plane: %d", ClippingPlane
);
1402 return _simd_setzero_ps();
#if USE_SIMD16_FRONTEND
// SIMD16 variant of the per-lane inside test against one frustum plane.
template<SWR_CLIPCODES ClippingPlane>
inline simd16scalar inside(const simd16vector& v)
{
    switch (ClippingPlane)
    {
    case FRUSTUM_LEFT:   return _simd16_cmpge_ps(v[0], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
    case FRUSTUM_RIGHT:  return _simd16_cmple_ps(v[0], v[3]);
    case FRUSTUM_TOP:    return _simd16_cmpge_ps(v[1], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
    case FRUSTUM_BOTTOM: return _simd16_cmple_ps(v[1], v[3]);
    case FRUSTUM_NEAR:
        // DX near plane is z >= 0; GL is z >= -w.
        return _simd16_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd16_setzero_ps() : _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
    case FRUSTUM_FAR:    return _simd16_cmple_ps(v[2], v[3]);
    default:
        SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
        return _simd16_setzero_ps();
    }
}
#endif
1425 template<SWR_CLIPCODES ClippingPlane
>
1426 simdscalari
ClipTriToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
1428 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1430 simdscalari vCurIndex
= _simd_setzero_si();
1431 simdscalari vOutIndex
= _simd_setzero_si();
1432 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
1434 while (!_simd_testz_ps(vActiveMask
, vActiveMask
)) // loop until activeMask is empty
1436 simdscalari s
= vCurIndex
;
1437 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
1438 simdscalari underFlowMask
= _simd_cmpgt_epi32(vNumInPts
, p
);
1439 p
= _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p
), _simd_castsi_ps(underFlowMask
)));
1442 simdvector vInPos0
, vInPos1
;
1443 for (uint32_t c
= 0; c
< 4; ++c
)
1445 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1446 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1449 // compute inside mask
1450 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
1451 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
1453 // compute intersection mask (s_in != p_in)
1454 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
1455 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
1457 // store s if inside
1458 s_in
= _simd_and_ps(s_in
, vActiveMask
);
1459 if (!_simd_testz_ps(s_in
, s_in
))
1462 for (uint32_t c
= 0; c
< 4; ++c
)
1464 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
1468 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1470 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1471 for (uint32_t c
= 0; c
< 4; ++c
)
1473 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1474 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1478 // store clip distance if enabled
1479 if (this->state
.rastState
.clipDistanceMask
& 0xf)
1481 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_LO_SLOT
;
1482 for (uint32_t c
= 0; c
< 4; ++c
)
1484 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1485 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1489 if (this->state
.rastState
.clipDistanceMask
& 0xf0)
1491 uint32_t attribSlot
= VERTEX_CLIPCULL_DIST_HI_SLOT
;
1492 for (uint32_t c
= 0; c
< 4; ++c
)
1494 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1495 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1499 // increment outIndex
1500 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
1503 // compute and store intersection
1504 if (!_simd_testz_ps(intersectMask
, intersectMask
))
1506 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1508 // increment outIndex for active lanes
1509 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
1512 // increment loop index and update active mask
1513 vCurIndex
= _simd_add_epi32(vCurIndex
, _simd_set1_epi32(1));
1514 vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
#if USE_SIMD16_FRONTEND
// SIMD16 variant of the polygon/plane clip; returns per-lane output vertex
// counts.
template<SWR_CLIPCODES ClippingPlane>
simd16scalari ClipTriToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
{
    uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

    simd16scalari vCurIndex = _simd16_setzero_si();
    simd16scalari vOutIndex = _simd16_setzero_si();
    simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));

    while (!_simd16_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
    {
        simd16scalari s = vCurIndex;
        simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));
        // wrap the second index back to vertex 0 when it runs past the ring
        simd16scalari underFlowMask = _simd16_cmpgt_epi32(vNumInPts, p);
        p = _simd16_castps_si(_simd16_blendv_ps(_simd16_setzero_ps(), _simd16_castsi_ps(p), _simd16_castsi_ps(underFlowMask)));

        // gather edge endpoint positions
        simd16vector vInPos0, vInPos1;
        for (uint32_t c = 0; c < 4; ++c)
        {
            vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
            vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
        }

        // compute inside mask
        simd16scalar s_in = inside<ClippingPlane>(vInPos0);
        simd16scalar p_in = inside<ClippingPlane>(vInPos1);

        // compute intersection mask (s_in != p_in)
        simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
        intersectMask = _simd16_and_ps(intersectMask, vActiveMask);

        // store s if inside
        s_in = _simd16_and_ps(s_in, vActiveMask);
        if (!_simd16_testz_ps(s_in, s_in))
        {
            // position
            for (uint32_t c = 0; c < 4; ++c)
            {
                ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
            }

            // attributes
            for (uint32_t a = 0; a < numInAttribs; ++a)
            {
                uint32_t attribSlot = vertexAttribOffset + a;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                    ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                }
            }

            // clip distances (low 4) if enabled
            if (this->state.rastState.clipDistanceMask & 0xf)
            {
                uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                    ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                }
            }

            // clip distances (high 4) if enabled
            if (this->state.rastState.clipDistanceMask & 0xf0)
            {
                uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                    ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                }
            }

            // increment outIndex for lanes that emitted s
            vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
        }

        // compute and store intersection
        if (!_simd16_testz_ps(intersectMask, intersectMask))
        {
            intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

            // increment outIndex for lanes that emitted an intersection
            vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
        }

        // increment loop index and update active mask
        vCurIndex = _simd16_add_epi32(vCurIndex, _simd16_set1_epi32(1));
        vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
    }

    return vOutIndex;
}
#endif
1617 template<SWR_CLIPCODES ClippingPlane
>
1618 simdscalari
ClipLineToPlane(const float* pInVerts
, const simdscalari
& vNumInPts
, uint32_t numInAttribs
, float* pOutVerts
)
1620 uint32_t vertexAttribOffset
= this->state
.backendState
.vertexAttribOffset
;
1622 simdscalari vCurIndex
= _simd_setzero_si();
1623 simdscalari vOutIndex
= _simd_setzero_si();
1624 simdscalar vActiveMask
= _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex
, vNumInPts
));
1626 if (!_simd_testz_ps(vActiveMask
, vActiveMask
))
1628 simdscalari s
= vCurIndex
;
1629 simdscalari p
= _simd_add_epi32(s
, _simd_set1_epi32(1));
1632 simdvector vInPos0
, vInPos1
;
1633 for (uint32_t c
= 0; c
< 4; ++c
)
1635 vInPos0
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, s
, c
);
1636 vInPos1
[c
] = GatherComponent(pInVerts
, VERTEX_POSITION_SLOT
, vActiveMask
, p
, c
);
1639 // compute inside mask
1640 simdscalar s_in
= inside
<ClippingPlane
>(vInPos0
);
1641 simdscalar p_in
= inside
<ClippingPlane
>(vInPos1
);
1643 // compute intersection mask (s_in != p_in)
1644 simdscalar intersectMask
= _simd_xor_ps(s_in
, p_in
);
1645 intersectMask
= _simd_and_ps(intersectMask
, vActiveMask
);
1647 // store s if inside
1648 s_in
= _simd_and_ps(s_in
, vActiveMask
);
1649 if (!_simd_testz_ps(s_in
, s_in
))
1651 for (uint32_t c
= 0; c
< 4; ++c
)
1653 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, s_in
, vOutIndex
, c
, vInPos0
[c
]);
1656 // interpolate attributes and store
1657 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1659 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1660 for (uint32_t c
= 0; c
< 4; ++c
)
1662 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, s_in
, s
, c
);
1663 ScatterComponent(pOutVerts
, attribSlot
, s_in
, vOutIndex
, c
, vAttrib
);
1667 // increment outIndex
1668 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), s_in
);
1671 // compute and store intersection
1672 if (!_simd_testz_ps(intersectMask
, intersectMask
))
1674 intersect
<ClippingPlane
>(intersectMask
, s
, p
, vInPos0
, vInPos1
, vOutIndex
, pInVerts
, numInAttribs
, pOutVerts
);
1676 // increment outIndex for active lanes
1677 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), intersectMask
);
1680 // store p if inside
1681 p_in
= _simd_and_ps(p_in
, vActiveMask
);
1682 if (!_simd_testz_ps(p_in
, p_in
))
1684 for (uint32_t c
= 0; c
< 4; ++c
)
1686 ScatterComponent(pOutVerts
, VERTEX_POSITION_SLOT
, p_in
, vOutIndex
, c
, vInPos1
[c
]);
1689 // interpolate attributes and store
1690 for (uint32_t a
= 0; a
< numInAttribs
; ++a
)
1692 uint32_t attribSlot
= vertexAttribOffset
+ a
;
1693 for (uint32_t c
= 0; c
< 4; ++c
)
1695 simdscalar vAttrib
= GatherComponent(pInVerts
, attribSlot
, p_in
, p
, c
);
1696 ScatterComponent(pOutVerts
, attribSlot
, p_in
, vOutIndex
, c
, vAttrib
);
1700 // increment outIndex
1701 vOutIndex
= _simd_blendv_epi32(vOutIndex
, _simd_add_epi32(vOutIndex
, _simd_set1_epi32(1)), p_in
);
#if USE_SIMD16_FRONTEND
// SIMD16 variant of the line/plane clip; returns per-lane output vertex
// counts.
template<SWR_CLIPCODES ClippingPlane>
simd16scalari ClipLineToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
{
    uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;

    simd16scalari vCurIndex = _simd16_setzero_si();
    simd16scalari vOutIndex = _simd16_setzero_si();
    simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));

    if (!_simd16_testz_ps(vActiveMask, vActiveMask))
    {
        simd16scalari s = vCurIndex;
        simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));

        // gather both endpoint positions
        simd16vector vInPos0, vInPos1;
        for (uint32_t c = 0; c < 4; ++c)
        {
            vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
            vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
        }

        // compute inside mask
        simd16scalar s_in = inside<ClippingPlane>(vInPos0);
        simd16scalar p_in = inside<ClippingPlane>(vInPos1);

        // compute intersection mask (s_in != p_in)
        simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
        intersectMask = _simd16_and_ps(intersectMask, vActiveMask);

        // store s if inside
        s_in = _simd16_and_ps(s_in, vActiveMask);
        if (!_simd16_testz_ps(s_in, s_in))
        {
            for (uint32_t c = 0; c < 4; ++c)
            {
                ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
            }

            // interpolate attributes and store
            for (uint32_t a = 0; a < numInAttribs; ++a)
            {
                uint32_t attribSlot = vertexAttribOffset + a;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                    ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                }
            }

            // increment outIndex
            vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
        }

        // compute and store intersection
        if (!_simd16_testz_ps(intersectMask, intersectMask))
        {
            intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);

            // increment outIndex for active lanes
            vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
        }

        // store p if inside
        p_in = _simd16_and_ps(p_in, vActiveMask);
        if (!_simd16_testz_ps(p_in, p_in))
        {
            for (uint32_t c = 0; c < 4; ++c)
            {
                ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
            }

            // interpolate attributes and store
            for (uint32_t a = 0; a < numInAttribs; ++a)
            {
                uint32_t attribSlot = vertexAttribOffset + a;
                for (uint32_t c = 0; c < 4; ++c)
                {
                    simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                    ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                }
            }

            // increment outIndex
            vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), p_in);
        }
    }

    return vOutIndex;
}
#endif
1800 //////////////////////////////////////////////////////////////////////////
1801 /// @brief Vertical clipper. Clips SIMD primitives at a time
1802 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
1803 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
1804 /// @param numAttribs - number of valid input attribs, including position
1805 simdscalari
ClipPrims(float* pVertices
, const simdscalar
& vPrimMask
, const simdscalar
& vClipMask
, int numAttribs
)
1808 float* pTempVerts
= (float*)&tlsTempVertices
[0];
1810 // zero out num input verts for non-active lanes
1811 simdscalari vNumInPts
= _simd_set1_epi32(NumVertsPerPrim
);
1812 vNumInPts
= _simd_blendv_epi32(_simd_setzero_si(), vNumInPts
, vClipMask
);
1814 // clip prims to frustum
1815 simdscalari vNumOutPts
;
1816 if (NumVertsPerPrim
== 3)
1818 vNumOutPts
= ClipTriToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1819 vNumOutPts
= ClipTriToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1820 vNumOutPts
= ClipTriToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1821 vNumOutPts
= ClipTriToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1822 vNumOutPts
= ClipTriToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1823 vNumOutPts
= ClipTriToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1827 SWR_ASSERT(NumVertsPerPrim
== 2);
1828 vNumOutPts
= ClipLineToPlane
<FRUSTUM_NEAR
>(pVertices
, vNumInPts
, numAttribs
, pTempVerts
);
1829 vNumOutPts
= ClipLineToPlane
<FRUSTUM_FAR
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1830 vNumOutPts
= ClipLineToPlane
<FRUSTUM_LEFT
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1831 vNumOutPts
= ClipLineToPlane
<FRUSTUM_RIGHT
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1832 vNumOutPts
= ClipLineToPlane
<FRUSTUM_BOTTOM
>(pVertices
, vNumOutPts
, numAttribs
, pTempVerts
);
1833 vNumOutPts
= ClipLineToPlane
<FRUSTUM_TOP
>(pTempVerts
, vNumOutPts
, numAttribs
, pVertices
);
1836 // restore num verts for non-clipped, active lanes
1837 simdscalar vNonClippedMask
= _simd_andnot_ps(vClipMask
, vPrimMask
);
1838 vNumOutPts
= _simd_blendv_epi32(vNumOutPts
, _simd_set1_epi32(NumVertsPerPrim
), vNonClippedMask
);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant of ClipPrims: clip a batch of primitives against
///        all six frustum planes, ping-ponging between pVertices and the
///        simd16 thread-local temp buffer so the final result ends up
///        back in pVertices after the sixth (even) pass.
/// @param pVertices - in/out vertex attribute storage for the batch
/// @param vPrimMask - mask of valid input primitives, including non-clipped prims
/// @param vClipMask - mask of primitives that actually require clipping
/// @param numAttribs - number of valid input attribs, including position
/// @return per-lane count of vertices produced for each primitive
simd16scalari ClipPrims(float* pVertices, const simd16scalar& vPrimMask, const simd16scalar& vClipMask, int numAttribs)
{
    // thread-local scratch buffer used as the ping-pong destination
    float* pTempVerts = (float*)&tlsTempVertices_simd16[0];

    // zero out num input verts for non-active lanes
    simd16scalari vNumInPts = _simd16_set1_epi32(NumVertsPerPrim);
    vNumInPts = _simd16_blendv_epi32(_simd16_setzero_si(), vNumInPts, vClipMask);

    // clip prims to frustum, one plane at a time
    simd16scalari vNumOutPts;
    if (NumVertsPerPrim == 3)
    {
        vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
        vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
        vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
        vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    }
    else
    {
        // only lines are expected on this path
        SWR_ASSERT(NumVertsPerPrim == 2);
        vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
        vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
        vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
        vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
        vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
    }

    // restore num verts for non-clipped, active lanes
    simd16scalar vNonClippedMask = _simd16_andnot_ps(vClipMask, vPrimMask);
    vNumOutPts = _simd16_blendv_epi32(vNumOutPts, _simd16_set1_epi32(NumVertsPerPrim), vNonClippedMask);

    // the visible block fell off the end of a value-returning function (UB);
    // return the per-lane output vertex counts
    return vNumOutPts;
}
// close the conditional opened above; the visible chunk left it unbalanced
// before the member declarations that follow
#endif
// Id of the worker thread that owns this clipper instance.
const uint32_t workerId{ 0 };
// Draw context of the draw currently being clipped; presumably assigned
// per draw by the owner — TODO confirm against the constructor/callers.
DRAW_CONTEXT* pDC{ nullptr };
// API state for the current draw; as a reference member it must be bound
// in the constructor's initializer list.
const API_STATE& state;
// Per-vertex frustum clip codes for one SIMD-width batch of primitives.
simdscalar clipCodes[NumVertsPerPrim];
#if USE_SIMD16_FRONTEND
// SIMD16-width counterpart of clipCodes, used by the simd16 frontend path.
simd16scalar clipCodes_simd16[NumVertsPerPrim];
// pipeline stage functions
// Frontend clipping entry points for one SIMD-width batch of primitives.
// primMask selects the valid lanes in prims[]; primId carries the per-lane
// primitive ids. NOTE(review): downstream behavior (where surviving prims
// are forwarded) is not visible from this chunk — see the definitions.
void ClipTriangles(DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipLines(DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
#if USE_SIMD16_FRONTEND
// SIMD16-width counterparts of the entry points above; SIMDCALL selects
// the wide-vector calling convention.
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);