1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
31 #include "common/simdintrin.h"
32 #include <type_traits>
34 // Calculates the A and B coefficients for the 3 edges of the triangle
// maths for edge equations:
//   standard form of a line in 2d: Ax + By + C = 0
//   A = y0 - y1
//   B = x1 - x0
//   C = x0y1 - x1y0
/// @brief Computes the A and B edge-equation coefficients for the three
///        edges of a triangle from its packed float vertex positions.
/// @param vX  x coordinates packed as [x0, x1, x2, dc] (lane 3 is don't-care).
/// @param vY  y coordinates packed as [y0, y1, y2, dc].
/// @param vA  receives [y0 - y1, y1 - y2, y2 - y0] in lanes 0..2.
/// @param vB  receives [x1 - x0, x2 - x1, x0 - x2] in lanes 0..2.
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
    // Rotate lanes 0..2 so every vertex lines up with its successor.
    // vYsub = y1 y2 y0 dc
    const __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));

    // A[i] = y[i] - y[i + 1]
    vA = _mm_sub_ps(vY, vYsub);

    // vXsub = x1 x2 x0 dc
    const __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));

    // B[i] = x[i + 1] - x[i]  (operands swapped relative to A)
    vB = _mm_sub_ps(vXsub, vX);
}
/// @brief Integer variant of the A/B edge-coefficient setup for one triangle.
/// @param vX  x coordinates packed as [x0, x1, x2, dc] (32-bit lanes).
/// @param vY  y coordinates packed as [y0, y1, y2, dc] (32-bit lanes).
/// @param vA  receives [y0 - y1, y1 - y2, y2 - y0] in lanes 0..2.
/// @param vB  receives [x1 - x0, x2 - x1, x0 - x2] in lanes 0..2.
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
    // generate edge equations
    // vYsub = y1 y2 y0 dc, so A[i] = y[i] - y[i + 1]
    const __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    vA = _mm_sub_epi32(vY, vYsub);

    // vXsub = x1 x2 x0 dc, so B[i] = x[i + 1] - x[i]
    const __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
    vB = _mm_sub_epi32(vXsub, vX);
}
81 void triangleSetupABIntVertical(const simdscalari vX
[3], const simdscalari vY
[3], simdscalari (&vA
)[3], simdscalari (&vB
)[3])
85 vA
[0] = _simd_sub_epi32(vY
[0], vY
[1]);
86 vA
[1] = _simd_sub_epi32(vY
[1], vY
[2]);
87 vA
[2] = _simd_sub_epi32(vY
[2], vY
[0]);
89 vB
[0] = _simd_sub_epi32(vX
[1], vX
[0]);
90 vB
[1] = _simd_sub_epi32(vX
[2], vX
[1]);
91 vB
[2] = _simd_sub_epi32(vX
[0], vX
[2]);
#if ENABLE_AVX512_SIMD16
/// @brief simd16 overload of the vertical A/B edge-coefficient setup;
///        element i of each array holds vertex i of every triangle in the
///        batch.
/// @param vX  x coordinates of vertices 0, 1 and 2.
/// @param vY  y coordinates of vertices 0, 1 and 2.
/// @param vA  receives A[0] = y0 - y1, A[1] = y1 - y2, A[2] = y2 - y0.
/// @param vB  receives B[0] = x1 - x0, B[1] = x2 - x1, B[2] = x0 - x2.
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari (&vA)[3], simd16scalari (&vB)[3])
{
    // A = y[i] - y[i + 1]
    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);

    // B = x[i + 1] - x[i]
    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif
110 // Calculate the determinant of the triangle
111 // 2 vectors between the 3 points: P, Q
112 // Px = x0-x2, Py = y0-y2
113 // Qx = x1-x2, Qy = y1-y2
//       | Px Qx |
// det = |       | = PxQy - PyQx
//       | Py Qy |
117 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
118 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
119 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
120 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
121 // : B[2]*A[1] - A[2]*B[1]
123 float calcDeterminantInt(const __m128i vA
, const __m128i vB
)
125 // vAShuf = [A1, A0, A2, A0]
126 __m128i vAShuf
= _mm_shuffle_epi32(vA
, _MM_SHUFFLE(0, 2, 0, 1));
127 // vBShuf = [B2, B0, B1, B0]
128 __m128i vBShuf
= _mm_shuffle_epi32(vB
, _MM_SHUFFLE(0, 1, 0, 2));
129 // vMul = [A1*B2, B1*A2]
130 __m128i vMul
= _mm_mul_epi32(vAShuf
, vBShuf
);
132 // shuffle upper to lower
133 // vMul2 = [B1*A2, B1*A2]
134 __m128i vMul2
= _mm_shuffle_epi32(vMul
, _MM_SHUFFLE(3, 2, 3, 2));
135 //vMul = [A1*B2 - B1*A2]
136 vMul
= _mm_sub_epi64(vMul
, vMul2
);
139 _mm_store_sd((double*)&result
, _mm_castsi128_pd(vMul
));
141 double dResult
= (double)result
;
142 dResult
= dResult
* (1.0 / FIXED_POINT16_SCALE
);
144 return (float)dResult
;
148 void calcDeterminantIntVertical(const simdscalari vA
[3], const simdscalari vB
[3], simdscalari
*pvDet
)
150 // refer to calcDeterminantInt comment for calculation explanation
153 simdscalari vA1Lo
= _simd_unpacklo_epi32(vA
[1], vA
[1]); // 0 0 1 1 4 4 5 5
154 simdscalari vA1Hi
= _simd_unpackhi_epi32(vA
[1], vA
[1]); // 2 2 3 3 6 6 7 7
156 simdscalari vB2Lo
= _simd_unpacklo_epi32(vB
[2], vB
[2]);
157 simdscalari vB2Hi
= _simd_unpackhi_epi32(vB
[2], vB
[2]);
159 simdscalari vA1B2Lo
= _simd_mul_epi32(vA1Lo
, vB2Lo
); // 0 1 4 5
160 simdscalari vA1B2Hi
= _simd_mul_epi32(vA1Hi
, vB2Hi
); // 2 3 6 7
163 simdscalari vA2Lo
= _simd_unpacklo_epi32(vA
[2], vA
[2]);
164 simdscalari vA2Hi
= _simd_unpackhi_epi32(vA
[2], vA
[2]);
166 simdscalari vB1Lo
= _simd_unpacklo_epi32(vB
[1], vB
[1]);
167 simdscalari vB1Hi
= _simd_unpackhi_epi32(vB
[1], vB
[1]);
169 simdscalari vA2B1Lo
= _simd_mul_epi32(vA2Lo
, vB1Lo
);
170 simdscalari vA2B1Hi
= _simd_mul_epi32(vA2Hi
, vB1Hi
);
173 simdscalari detLo
= _simd_sub_epi64(vA1B2Lo
, vA2B1Lo
);
174 simdscalari detHi
= _simd_sub_epi64(vA1B2Hi
, vA2B1Hi
);
176 // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
177 simdscalari vResultLo
= _simd_permute2f128_si(detLo
, detHi
, 0x20);
179 // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
180 simdscalari vResultHi
= _simd_permute2f128_si(detLo
, detHi
, 0x31);
182 pvDet
[0] = vResultLo
;
183 pvDet
[1] = vResultHi
;
#if ENABLE_AVX512_SIMD16
/// @brief simd16 overload: computes A[1]*B[2] - A[2]*B[1] for every triangle
///        of a vertically stored batch, producing 64-bit results.
/// @param vA     A edge coefficients (32-bit lanes), indexed by edge.
/// @param vB     B edge coefficients (32-bit lanes), indexed by edge.
/// @param pvDet  output array of two vectors: pvDet[0] receives the 64-bit
///               determinants of primitives 0..7, pvDet[1] those of 8..F.
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari* pvDet)
{
    // refer to calcDeterminantInt comment for calculation explanation

    // A1*B2
    simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F

    simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b)
    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F

    // A2*B1
    simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);

    simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
    simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);

    // A1*B2 - A2*B1
    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b)
    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F

    // Gather the 128-bit groups belonging to each output half.
    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F

    // Swap the two middle 128-bit groups to restore primitive order.
    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F
}
#endif
/// @brief Computes the C term of each edge equation so that Ax + By + C = 0
///        holds at the edge's own vertex, i.e. C = -A*x - B*y.
/// @param vX  x coordinates of the vertices, one per lane.
/// @param vY  y coordinates of the vertices, one per lane.
/// @param vA  A edge coefficients, one per lane.
/// @param vB  B edge coefficients, one per lane.
/// @param vC  receives the C coefficients, one per lane.
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
{
    // C = -Ax - By
    vC = _mm_mul_ps(vA, vX);
    const __m128 vCy = _mm_mul_ps(vB, vY);
    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
    vC = _mm_sub_ps(vC, vCy);
}
236 template<uint32_t NumVerts
>
238 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
)
240 simdscalar m00
= _simd_load1_ps(&vpMatrices
.m00
[0]);
241 simdscalar m30
= _simd_load1_ps(&vpMatrices
.m30
[0]);
242 simdscalar m11
= _simd_load1_ps(&vpMatrices
.m11
[0]);
243 simdscalar m31
= _simd_load1_ps(&vpMatrices
.m31
[0]);
244 simdscalar m22
= _simd_load1_ps(&vpMatrices
.m22
[0]);
245 simdscalar m32
= _simd_load1_ps(&vpMatrices
.m32
[0]);
247 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
249 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
250 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
251 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
/// @brief simd16 overload: applies the scale/offset viewport transform in
///        place to NumVerts vertices, using entry 0 of every matrix array
///        for all lanes.
/// @tparam NumVerts  number of elements of @p v to transform.
/// @param v           vertices to transform; x, y and z are overwritten.
/// @param vpMatrices  viewport matrices; m00/m11/m22 are the per-axis scales
///                    and m30/m31/m32 the per-axis offsets.
template<uint32_t NumVerts>
void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    // Broadcast element 0 of each matrix array once, outside the vertex loop.
    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        // component = component * scale + offset
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif
276 template<uint32_t NumVerts
>
278 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
, simdscalari
const &vViewportIdx
)
280 // perform a gather of each matrix element based on the viewport array indexes
281 simdscalar m00
= _simd_i32gather_ps(&vpMatrices
.m00
[0], vViewportIdx
, 4);
282 simdscalar m30
= _simd_i32gather_ps(&vpMatrices
.m30
[0], vViewportIdx
, 4);
283 simdscalar m11
= _simd_i32gather_ps(&vpMatrices
.m11
[0], vViewportIdx
, 4);
284 simdscalar m31
= _simd_i32gather_ps(&vpMatrices
.m31
[0], vViewportIdx
, 4);
285 simdscalar m22
= _simd_i32gather_ps(&vpMatrices
.m22
[0], vViewportIdx
, 4);
286 simdscalar m32
= _simd_i32gather_ps(&vpMatrices
.m32
[0], vViewportIdx
, 4);
288 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
290 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
291 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
292 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
/// @brief simd16 overload: applies the scale/offset viewport transform in
///        place to NumVerts vertices, selecting the matrix entries per lane.
/// @tparam NumVerts  number of elements of @p v to transform.
/// @param v             vertices to transform; x, y and z are overwritten.
/// @param vpMatrices    viewport matrices; m00/m11/m22 are the per-axis
///                      scales and m30/m31/m32 the per-axis offsets.
/// @param vViewportIdx  per-lane index into the matrix arrays.
template<uint32_t NumVerts>
void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices, simd16scalari const& vViewportIdx)
{
    // perform a gather of each matrix element based on the viewport array indexes
    // (scale of 4 bytes per index step)
    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        // component = component * scale + offset
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif
319 void calcBoundingBoxInt(const __m128i
&vX
, const __m128i
&vY
, SWR_RECT
&bbox
)
321 // Need horizontal fp min here
322 __m128i vX1
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 2, 0, 1));
323 __m128i vX2
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 0, 1, 2));
325 __m128i vY1
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 2, 0, 1));
326 __m128i vY2
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 0, 1, 2));
329 __m128i vMinX
= _mm_min_epi32(vX
, vX1
);
330 vMinX
= _mm_min_epi32(vMinX
, vX2
);
332 __m128i vMaxX
= _mm_max_epi32(vX
, vX1
);
333 vMaxX
= _mm_max_epi32(vMaxX
, vX2
);
335 __m128i vMinY
= _mm_min_epi32(vY
, vY1
);
336 vMinY
= _mm_min_epi32(vMinY
, vY2
);
338 __m128i vMaxY
= _mm_max_epi32(vY
, vY1
);
339 vMaxY
= _mm_max_epi32(vMaxY
, vY2
);
341 bbox
.xmin
= _mm_extract_epi32(vMinX
, 0);
342 bbox
.xmax
= _mm_extract_epi32(vMaxX
, 0);
343 bbox
.ymin
= _mm_extract_epi32(vMinY
, 0);
344 bbox
.ymax
= _mm_extract_epi32(vMaxY
, 0);
348 bool CanUseSimplePoints(DRAW_CONTEXT
*pDC
)
350 const API_STATE
& state
= GetApiState(pDC
);
352 return (state
.rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&&
353 state
.rastState
.pointSize
== 1.0f
&&
354 !state
.rastState
.pointParam
&&
355 !state
.rastState
.pointSpriteEnable
&&
356 !state
.backendState
.clipDistanceMask
);
/// @brief Tests whether any of the four lanes of a vector is NaN.
/// @param vec  vector to inspect.
/// @return true if at least one lane is NaN, false otherwise.
bool vHasNaN(const __m128& vec)
{
    // A value compares unordered with itself only when it is NaN.
    const __m128 result = _mm_cmpunord_ps(vec, vec);
    // One bit per lane; any set bit means a NaN was found.
    const int32_t mask = _mm_movemask_ps(result);
    return (mask != 0);
}
367 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode
, uint32_t numElements
);
368 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
);
371 // ProcessDraw front-end function. All combinations of parameter values are available
372 PFN_FE_WORK_FUNC
GetProcessDrawFunc(
374 bool IsCutIndexEnabled
,
375 bool HasTessellation
,
376 bool HasGeometryShader
,
378 bool HasRasterization
);
380 void ProcessClear(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
381 void ProcessStoreTiles(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
382 void ProcessDiscardInvalidateTiles(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
383 void ProcessSync(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
384 void ProcessShutdown(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
386 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
);
387 #if USE_SIMD16_FRONTEND
388 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
);
391 struct PA_STATE_BASE
; // forward decl
392 void BinPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], uint32_t primMask
, simdscalari
const &primID
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
393 void BinLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], uint32_t primMask
, simdscalari
const &primID
, simdscalari
const &viewportIdx
, simdscalari
const &rtIdx
);
394 #if USE_SIMD16_FRONTEND
395 void SIMDCALL
BinPoints_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[3], uint32_t primMask
, simd16scalari
const &primID
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);
396 void SIMDCALL
BinLines_simd16(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simd16vector prims
[3], uint32_t primMask
, simd16scalari
const &primID
, simd16scalari
const &viewportIdx
, simd16scalari
const &rtIdx
);