/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.h
*
* @brief Definitions for Frontend which handles vertex processing,
*        primitive assembly, clipping, binning, etc.
*
******************************************************************************/
#pragma once
#include "context.h"
#include <type_traits>

#if ENABLE_AVX512_SIMD16
// TODO: this belongs in state.h alongside the simdvector definition, but there is an LLVM codegen issue
struct simd16vertex
{
    simd16vector attrib[SWR_VTX_NUM_SLOTS];
};

#endif
// Calculates the A and B coefficients for the 3 edges of the triangle
//
// maths for edge equations:
// standard form of a line in 2d
// Ax + By + C = 0
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
INLINE
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
{
    // vYsub = y1 y2 y0 dc
    __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
    // vY    = y0 y1 y2 dc
    vA = _mm_sub_ps(vY, vYsub);

    // Result:
    // A[0] = y0 - y1
    // A[1] = y1 - y2
    // A[2] = y2 - y0

    // vXsub = x1 x2 x0 dc
    __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
    // vX    = x0 x1 x2 dc
    vB = _mm_sub_ps(vXsub, vX);

    // Result:
    // B[0] = x1 - x0
    // B[1] = x2 - x1
    // B[2] = x0 - x2
}

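// Vertical (structure-of-arrays) variant: computes the same A/B edge coefficients
// for a full SIMD width of triangles at once, one triangle per lane.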
INLINE
void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3])
{
    // generate edge equations
    // A = y0 - y1
    // B = x1 - x0
    vA[0] = _simd_sub_ps(vY[0], vY[1]);
    vA[1] = _simd_sub_ps(vY[1], vY[2]);
    vA[2] = _simd_sub_ps(vY[2], vY[0]);

    vB[0] = _simd_sub_ps(vX[1], vX[0]);
    vB[1] = _simd_sub_ps(vX[2], vX[1]);
    vB[2] = _simd_sub_ps(vX[0], vX[2]);
}

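// Fixed-point integer variant of triangleSetupAB.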
INLINE
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
{
    // generate edge equations
    // A = y0 - y1
    // B = x1 - x0
    // C = x0y1 - x1y0
    __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    vA = _mm_sub_epi32(vY, vYsub);

    __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
    vB = _mm_sub_epi32(vXsub, vX);
}

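// Vertical fixed-point variant: integer A/B edge coefficients for a SIMD width of triangles.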
INLINE
void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
{
    // A = y0 - y1
    // B = x1 - x0
    vA[0] = _simd_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd_sub_epi32(vY[2], vY[0]);

    vB[0] = _simd_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd_sub_epi32(vX[0], vX[2]);
}

#if ENABLE_AVX512_SIMD16
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3])
{
    // A = y0 - y1
    // B = x1 - x0
    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);

    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}

#endif
// Calculate the determinant of the triangle
// 2 vectors between the 3 points: P, Q
// Px = x0-x2, Py = y0-y2
// Qx = x1-x2, Qy = y1-y2
//
//       | Px Qx |
// det = |       | = PxQy - PyQx
//       | Py Qy |
//
// simplifies to: (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
// try to reuse the A and B coefficients already calculated; factor out a -1 from Py and Qx
// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
// : B[2]*A[1] - A[2]*B[1]
INLINE
float calcDeterminantInt(const __m128i vA, const __m128i vB)
{
    // vAShuf = [A1, A0, A2, A0]
    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
    // vBShuf = [B2, B0, B1, B0]
    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
    // vMul = [A1*B2, B1*A2]
    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);

    // shuffle upper to lower
    // vMul2 = [B1*A2, B1*A2]
    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
    // vMul = [A1*B2 - B1*A2]
    vMul = _mm_sub_epi64(vMul, vMul2);

    int64_t result;
    _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));

    double dResult = (double)result;
    dResult = dResult * (1.0 / FIXED_POINT16_SCALE);

    return (float)dResult;
}

INLINE
void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
{
    // refer to calcDeterminantInt comment for calculation explanation
    // A1*B2
    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);   // 0 0 1 1 4 4 5 5
    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);   // 2 2 3 3 6 6 7 7

    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);

    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo);      // 0 1 4 5
    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi);      // 2 3 6 7

    // B1*A2
    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);

    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);

    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);

    // A1*B2 - A2*B1
    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);

    // shuffle 0 1 4 5 -> 0 1 2 3
    simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
    simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);

    pvDet[0] = vResultLo;
    pvDet[1] = vResultHi;
}

#if ENABLE_AVX512_SIMD16
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
{
    // refer to calcDeterminantInt comment for calculation explanation
    // A1*B2

#if 1
    // TODO: get the native SIMD16 version working..

    simdscalari vA_lo[3];
    simdscalari vA_hi[3];
    simdscalari vB_lo[3];
    simdscalari vB_hi[3];

    for (uint32_t i = 0; i < 3; i += 1)
    {
        vA_lo[i] = _simd16_extract_si(vA[i], 0);
        vA_hi[i] = _simd16_extract_si(vA[i], 1);
        vB_lo[i] = _simd16_extract_si(vB[i], 0);
        vB_hi[i] = _simd16_extract_si(vB[i], 1);
    }

    calcDeterminantIntVertical(vA_lo, vB_lo, reinterpret_cast<simdscalari *>(&pvDet[0]));
    calcDeterminantIntVertical(vA_hi, vB_hi, reinterpret_cast<simdscalari *>(&pvDet[1]));
#else
    simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]);   // 0 0 1 1 4 4 5 5 8 8 9 9 C C D D
    simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]);   // 2 2 3 3 6 6 7 7 A A B B E E F F

    simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo);      // 0 1 4 5 8 9 C D
    simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi);      // 2 3 6 7 A B E F

    // B1*A2
    simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    simd16scalari vA2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);

    simd16scalari vB1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    simd16scalari vB1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    simd16scalari vA2B1Lo = _simd16_mul_epi32(vA2Lo, vB1Lo);
    simd16scalari vA2B1Hi = _simd16_mul_epi32(vA2Hi, vB1Hi);

    // A1*B2 - A2*B1
    simd16scalari detLo = _simd16_sub_epi64(vA1B2Lo, vA2B1Lo);
    simd16scalari detHi = _simd16_sub_epi64(vA1B2Hi, vA2B1Hi);

    // shuffle 0 1 4 5 -> 0 1 2 3
    simd16scalari vResultLo = _simd16_permute2f128_si(detLo, detHi, 0x20);
    simd16scalari vResultHi = _simd16_permute2f128_si(detLo, detHi, 0x31);

    pvDet[0] = vResultLo;
    pvDet[1] = vResultHi;
#endif
}

#endif
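// Computes the C coefficient of each edge equation so that A*x + B*y + C == 0
// at the edge's starting vertex (x, y).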
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
{
    // C = -Ax - By
    vC = _mm_mul_ps(vA, vX);
    __m128 vCy = _mm_mul_ps(vB, vY);
    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
    vC = _mm_sub_ps(vC, vCy);
}
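// Usage sketch (illustrative only, not an entry point defined in this header):
// together the coefficients form one edge equation per triangle edge,
//
//     E(px, py) = A * px + B * py + C
//
// which is zero along the edge; the sign of E tells which side of the edge a
// sample (px, py) lies on, and testing all three edges yields triangle coverage.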

INLINE
void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
{
    vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
    vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));

    vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
    vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));

    vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
    vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
}

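// SIMD viewport transform: applies the viewport scale and translate to NumVerts
// vertex position vectors; this overload uses viewport index 0 for every lane.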
template<uint32_t NumVerts>
INLINE
void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
{
    simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
    simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
    simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
    simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
    simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
    simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
    }
}

#if USE_SIMD16_FRONTEND
template<uint32_t NumVerts>
INLINE
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
{
    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}

#endif
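// Viewport transform with per-vertex viewport selection: each matrix element is
// gathered using the viewport array index carried in vViewportIdx.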
template<uint32_t NumVerts>
INLINE
void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
{
    // perform a gather of each matrix element based on the viewport array indexes
    simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
    }
}

#if USE_SIMD16_FRONTEND
template<uint32_t NumVerts>
INLINE
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari vViewportIdx)
{
    // perform a gather of each matrix element based on the viewport array indexes
    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}

#endif
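// Computes the integer (fixed-point) bounding box of a triangle from the three
// vertex x/y coordinates held in the low lanes of vX/vY.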
INLINE
void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
{
    // need a horizontal min/max of the fixed-point coordinates here
    __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
    __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));

    __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
    __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));

    __m128i vMinX = _mm_min_epi32(vX, vX1);
    vMinX = _mm_min_epi32(vMinX, vX2);

    __m128i vMaxX = _mm_max_epi32(vX, vX1);
    vMaxX = _mm_max_epi32(vMaxX, vX2);

    __m128i vMinY = _mm_min_epi32(vY, vY1);
    vMinY = _mm_min_epi32(vMinY, vY2);

    __m128i vMaxY = _mm_max_epi32(vY, vY1);
    vMaxY = _mm_max_epi32(vMaxY, vY2);

    bbox.xmin = _mm_extract_epi32(vMinX, 0);
    bbox.xmax = _mm_extract_epi32(vMaxX, 0);
    bbox.ymin = _mm_extract_epi32(vMinY, 0);
    bbox.ymax = _mm_extract_epi32(vMaxY, 0);
}

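// Returns true when points can take the simple (single-pixel) point path:
// single-sampled, point size of 1.0, no per-vertex point size and no point sprites.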
INLINE
bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
{
    const API_STATE& state = GetApiState(pDC);

    return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
            state.rastState.pointSize == 1.0f &&
            !state.rastState.pointParam &&
            !state.rastState.pointSpriteEnable);
}

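// Returns true if any component of vec is NaN (a NaN compares unordered with itself).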
INLINE
bool vHasNaN(const __m128& vec)
{
    const __m128 result = _mm_cmpunord_ps(vec, vec);
    const int32_t mask = _mm_movemask_ps(result);
    return (mask != 0);
}

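// GetNumPrims: number of primitives produced by numElements vertices/indices of the given topology.
// NumVertsPerPrim: number of vertices per primitive for the topology, optionally counting adjacency vertices.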
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);


// Returns the ProcessDraw front-end work function for the requested combination of features; all combinations of parameter values are available.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization);

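// Front-end work functions for the non-draw operations (clear, store tiles,
// discard/invalidate tiles, sync, shutdown).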
void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);

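// Returns the triangle binning function, selecting the conservative-rasterization variant when IsConservative is set.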
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
#endif

struct PA_STATE_BASE;  // forward decl
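// Binning entry points for point and line primitives.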
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
#if USE_SIMD16_FRONTEND
void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
#endif