/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;   // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si;  // vector of 4 int (sse2)
#else
typedef __m64 v2si;    // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure out a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
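
/* With gcc, for example, _PS_CONST(1, 1.0f) below expands to
     static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };
   i.e. each constant is a 16-byte-aligned array of four identical lanes, so it
   can be loaded with a single aligned 128-bit read via the *(v4sf*)_ps_1 casts
   used throughout this file. */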

_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

v4sf log_ps(v4sf x);
v4sf exp_ps(v4sf x);
v4sf sin_ps(v4sf x);
v4sf cos_ps(v4sf x);
void sincos_ps(v4sf x, v4sf *s, v4sf *c);

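/* Example usage (an illustrative sketch only, not part of the original library;
   the function name and output formatting are hypothetical and it is kept under
   #if 0 so it is never compiled): pack four floats into a __m128, call one of
   the functions above, and store the four results back to memory. */
#if 0
#include <stdio.h>
static void example_usage(void) {
  v4sf x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   /* lanes are 1, 2, 3, 4 */
  v4sf s, c;
  float out[4];

  _mm_storeu_ps(out, exp_ps(x));
  printf("exp: %f %f %f %f\n", out[0], out[1], out[2], out[3]);

  sincos_ps(x, &s, &c);                          /* sine and cosine in one pass */
  _mm_storeu_ps(out, s);
  printf("sin: %f %f %f %f\n", out[0], out[1], out[2], out[3]);
}
#endif
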
#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with
   respect to some intrinsics. The movehl_ps bug is fixed in mingw 3.4.5, but I
   found out that all the _mm_cmp* intrinsics were completely broken on my
   mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level
*/

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
      "movhlps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpltps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
      "cmpnleps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
      "cmpeqps %2,%0\n\t"
      : "=x" (a)
      : "0" (a), "x"(b)
      );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                      \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;   \
}

#endif // USE_SSE2
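
/* xmm_mm_union simply reinterprets a 128-bit SSE register as its two 64-bit
   MMX halves (low half in mm[0], high half in mm[1]). Any code that touches
   the MMX side must finish with _mm_empty() before using x87 floating point
   again, which is what the functions below do. */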

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
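
/* For reference, an illustrative scalar transcription of the same cephes-style
   algorithm (not part of the original file; assumes <math.h> for frexpf and
   NAN, and is kept under #if 0 so it is never compiled): split x = m * 2^e
   with m in [0.5, 1), evaluate a degree-8 polynomial in m, and add back
   e * log(2) split into two constants for extra precision. */
#if 0
#include <math.h>
static float log_scalar_sketch(float x) {
  if (x <= 0.0f) return NAN;                 /* roughly matching log_ps, which
                                                returns NaN for non-positive inputs */
  int e;
  float m = frexpf(x, &e);                   /* x = m * 2^e, 0.5 <= m < 1 */
  if (m < 0.707106781186547524f) {           /* SQRTHF */
    e -= 1;
    m = m + m - 1.0f;
  } else {
    m = m - 1.0f;
  }
  /* same coefficients as _ps_cephes_log_p0 .. _ps_cephes_log_p8 */
  float y = 7.0376836292E-2f;
  y = y * m - 1.1514610310E-1f;
  y = y * m + 1.1676998740E-1f;
  y = y * m - 1.2420140846E-1f;
  y = y * m + 1.4249322787E-1f;
  y = y * m - 1.6668057665E-1f;
  y = y * m + 2.0000714765E-1f;
  y = y * m - 2.4999993993E-1f;
  y = y * m + 3.3333331174E-1f;
  y = y * m * m * m;                         /* poly(m) * m^3 */
  y += e * -2.12194440e-4f;                  /* q1 */
  y -= 0.5f * m * m;
  return m + y + e * 0.693359375f;           /* q2: q1 + q2 = log(2) */
}
#endif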
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
  v4sf e, mask, tmp, z, y;

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contains the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */

  mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  z = _mm_mul_ps(x,x);

  y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);  // negative args will be NaN
  return x;
}

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;
  v4sf mask, z, y, pow2n;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
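  /* fx now holds x/log(2) + 0.5; the truncate-and-fix-up floor below turns it
     into the integer n nearest to x/log(2), so that x = g + n*log(2) with
     |g| <= log(2)/2 */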

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
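  /* n + 127 (the IEEE-754 exponent bias, _pi32_0x7f) is shifted into the
     exponent field (bits 23..30); reinterpreted as floats, that bit pattern
     is exactly 2^n, which then scales the polynomial result */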
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
   it also runs on old Athlon XPs and the Pentium III of your
   grandmother.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my Core Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  v4sf swap_sign_bit, poly_mask, z, tmp, y2;

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
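  /* bit 2 of j (value 4), shifted left by 29, lands exactly on the float sign
     bit (bit 31); xor-ing it into sign_bit later flips the sign of the result
     for the half-period where the sine is negative */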
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  swap_sign_bit = _mm_castsi128_ps(emm0);
  poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
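  /* minus_cephes_DP1 + DP2 + DP3 sum to -Pi/4 to well beyond single precision;
     since y counts multiples of Pi/4, subtracting y*Pi/4 in three steps of
     decreasing magnitude keeps the reduced argument accurate even when y is
     fairly large */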
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (the cosine approximation) */
  y = *(v4sf*)_ps_coscof_p0;
  z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (the sine approximation) */
  y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  v4sf sign_bit, poly_mask, z, tmp, y2;

  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  sign_bit = _mm_castsi128_ps(emm0);
  poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (the cosine approximation) */
  y = *(v4sf*)_ps_coscof_p0;
  z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (the sine approximation) */
  y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them... it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  v4sf swap_sign_bit_sin, poly_mask, z, tmp, y2, ysin1, ysin2;
  v4sf sign_bit_cos;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (the cosine approximation) */
  z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (the sine approximation) */
  y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  ysin2 = _mm_and_ps(xmm3, y2);
  ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

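/* Illustrative self-test sketch (not part of the original file; the function
   name is hypothetical, <math.h> and <stdio.h> are assumed to be available,
   and the block is kept under #if 0 so it is never compiled). It compares
   sincos_ps against the scalar libm functions for a few arguments. */
#if 0
#include <math.h>
#include <stdio.h>
static void sincos_selftest(void) {
  const float in[4] = { 0.5f, 1.0f, 2.0f, 100.0f };
  float s4[4], c4[4];
  v4sf s, c;

  sincos_ps(_mm_loadu_ps(in), &s, &c);
  _mm_storeu_ps(s4, s);
  _mm_storeu_ps(c4, c);

  for (int i = 0; i < 4; ++i)
    printf("x=%g  sin err=%g  cos err=%g\n",
           in[i], s4[i] - sinf(in[i]), c4[i] - cosf(in[i]));
}
#endif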