/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2 the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007  Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif
54 /* declare some SSE constants -- why can't I figure a better way to do that? */
55 #define _PS_CONST(Name, Val) \
56 static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
57 #define _PI32_CONST(Name, Val) \
58 static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
59 #define _PS_CONST_TYPE(Name, Type, Val) \
60 static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
64 /* the smallest non denormalized float number */
65 _PS_CONST_TYPE(min_norm_pos
, int, 0x00800000);
66 _PS_CONST_TYPE(mant_mask
, int, 0x7f800000);
67 _PS_CONST_TYPE(inv_mant_mask
, int, ~0x7f800000);
69 _PS_CONST_TYPE(sign_mask
, int, 0x80000000);
70 _PS_CONST_TYPE(inv_sign_mask
, int, ~0x80000000);
73 _PI32_CONST(inv1
, ~1);
76 _PI32_CONST(0x7f, 0x7f);
78 _PS_CONST(cephes_SQRTHF
, 0.707106781186547524);
79 _PS_CONST(cephes_log_p0
, 7.0376836292E-2);
80 _PS_CONST(cephes_log_p1
, - 1.1514610310E-1);
81 _PS_CONST(cephes_log_p2
, 1.1676998740E-1);
82 _PS_CONST(cephes_log_p3
, - 1.2420140846E-1);
83 _PS_CONST(cephes_log_p4
, + 1.4249322787E-1);
84 _PS_CONST(cephes_log_p5
, - 1.6668057665E-1);
85 _PS_CONST(cephes_log_p6
, + 2.0000714765E-1);
86 _PS_CONST(cephes_log_p7
, - 2.4999993993E-1);
87 _PS_CONST(cephes_log_p8
, + 3.3333331174E-1);
88 _PS_CONST(cephes_log_q1
, -2.12194440e-4);
89 _PS_CONST(cephes_log_q2
, 0.693359375);
95 void sincos_ps(v4sf x
, v4sf
*s
, v4sf
*c
);
#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* does occur only at -O0 optimization level
*/

/* hand-written replacement for the broken _mm_movehl_ps intrinsic */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
	asm (
			"movhlps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

/* hand-written replacements for the broken _mm_cmp* intrinsics */
inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
	asm (
			"cmpltps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
	asm (
			"cmpnleps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
	asm (
			"cmpeqps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
#ifndef USE_SSE2
/* reinterpret one __m128 as a pair of __m64 (and back) without memory traffic */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
    xmm_mm_union u; u.xmm = xmm_;                   \
    mm0_ = u.mm[0];                                 \
    mm1_ = u.mm[1];                                 \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
  }
#endif // USE_SSE2
164 /* natural logarithm computed for 4 simultaneous float
165 return NaN for x <= 0
167 v4sf
log_ps(v4sf x
) {
173 v4sf one
= *(v4sf
*)_ps_1
;
175 v4sf invalid_mask
= _mm_cmple_ps(x
, _mm_setzero_ps());
176 v4sf e
, mask
, tmp
, z
, y
;
178 x
= _mm_max_ps(x
, *(v4sf
*)_ps_min_norm_pos
); /* cut off denormalized stuff */
181 /* part 1: x = frexpf(x, &e); */
182 COPY_XMM_TO_MM(x
, mm0
, mm1
);
183 mm0
= _mm_srli_pi32(mm0
, 23);
184 mm1
= _mm_srli_pi32(mm1
, 23);
186 emm0
= _mm_srli_epi32(_mm_castps_si128(x
), 23);
188 /* keep only the fractional part */
189 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_mant_mask
);
190 x
= _mm_or_ps(x
, *(v4sf
*)_ps_0p5
);
193 /* now e=mm0:mm1 contain the really base-2 exponent */
194 mm0
= _mm_sub_pi32(mm0
, *(v2si
*)_pi32_0x7f
);
195 mm1
= _mm_sub_pi32(mm1
, *(v2si
*)_pi32_0x7f
);
196 e
= _mm_cvtpi32x2_ps(mm0
, mm1
);
197 _mm_empty(); /* bye bye mmx */
199 emm0
= _mm_sub_epi32(emm0
, *(v4si
*)_pi32_0x7f
);
200 e
= _mm_cvtepi32_ps(emm0
);
203 e
= _mm_add_ps(e
, one
);
209 } else { x = x - 1.0; }
212 mask
= _mm_cmplt_ps(x
, *(v4sf
*)_ps_cephes_SQRTHF
);
213 tmp
= _mm_and_ps(x
, mask
);
214 x
= _mm_sub_ps(x
, one
);
215 e
= _mm_sub_ps(e
, _mm_and_ps(one
, mask
));
216 x
= _mm_add_ps(x
, tmp
);
221 y
= *(v4sf
*)_ps_cephes_log_p0
;
222 y
= _mm_mul_ps(y
, x
);
223 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p1
);
224 y
= _mm_mul_ps(y
, x
);
225 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p2
);
226 y
= _mm_mul_ps(y
, x
);
227 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p3
);
228 y
= _mm_mul_ps(y
, x
);
229 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p4
);
230 y
= _mm_mul_ps(y
, x
);
231 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p5
);
232 y
= _mm_mul_ps(y
, x
);
233 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p6
);
234 y
= _mm_mul_ps(y
, x
);
235 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p7
);
236 y
= _mm_mul_ps(y
, x
);
237 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p8
);
238 y
= _mm_mul_ps(y
, x
);
240 y
= _mm_mul_ps(y
, z
);
243 tmp
= _mm_mul_ps(e
, *(v4sf
*)_ps_cephes_log_q1
);
244 y
= _mm_add_ps(y
, tmp
);
247 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
248 y
= _mm_sub_ps(y
, tmp
);
250 tmp
= _mm_mul_ps(e
, *(v4sf
*)_ps_cephes_log_q2
);
251 x
= _mm_add_ps(x
, y
);
252 x
= _mm_add_ps(x
, tmp
);
253 x
= _mm_or_ps(x
, invalid_mask
); // negative arg will be NAN
257 _PS_CONST(exp_hi
, 88.3762626647949f
);
258 _PS_CONST(exp_lo
, -88.3762626647949f
);
260 _PS_CONST(cephes_LOG2EF
, 1.44269504088896341);
261 _PS_CONST(cephes_exp_C1
, 0.693359375);
262 _PS_CONST(cephes_exp_C2
, -2.12194440e-4);
264 _PS_CONST(cephes_exp_p0
, 1.9875691500E-4);
265 _PS_CONST(cephes_exp_p1
, 1.3981999507E-3);
266 _PS_CONST(cephes_exp_p2
, 8.3334519073E-3);
267 _PS_CONST(cephes_exp_p3
, 4.1665795894E-2);
268 _PS_CONST(cephes_exp_p4
, 1.6666665459E-1);
269 _PS_CONST(cephes_exp_p5
, 5.0000001201E-1);
271 v4sf
exp_ps(v4sf x
) {
272 v4sf tmp
= _mm_setzero_ps(), fx
;
278 v4sf one
= *(v4sf
*)_ps_1
;
279 v4sf mask
, z
, y
, pow2n
;
281 x
= _mm_min_ps(x
, *(v4sf
*)_ps_exp_hi
);
282 x
= _mm_max_ps(x
, *(v4sf
*)_ps_exp_lo
);
284 /* express exp(x) as exp(g + n*log(2)) */
285 fx
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_LOG2EF
);
286 fx
= _mm_add_ps(fx
, *(v4sf
*)_ps_0p5
);
288 /* how to perform a floorf with SSE: just below */
290 /* step 1 : cast to int */
291 tmp
= _mm_movehl_ps(tmp
, fx
);
292 mm0
= _mm_cvttps_pi32(fx
);
293 mm1
= _mm_cvttps_pi32(tmp
);
294 /* step 2 : cast back to float */
295 tmp
= _mm_cvtpi32x2_ps(mm0
, mm1
);
297 emm0
= _mm_cvttps_epi32(fx
);
298 tmp
= _mm_cvtepi32_ps(emm0
);
300 /* if greater, substract 1 */
301 mask
= _mm_cmpgt_ps(tmp
, fx
);
302 mask
= _mm_and_ps(mask
, one
);
303 fx
= _mm_sub_ps(tmp
, mask
);
305 tmp
= _mm_mul_ps(fx
, *(v4sf
*)_ps_cephes_exp_C1
);
306 z
= _mm_mul_ps(fx
, *(v4sf
*)_ps_cephes_exp_C2
);
307 x
= _mm_sub_ps(x
, tmp
);
308 x
= _mm_sub_ps(x
, z
);
312 y
= *(v4sf
*)_ps_cephes_exp_p0
;
313 y
= _mm_mul_ps(y
, x
);
314 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p1
);
315 y
= _mm_mul_ps(y
, x
);
316 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p2
);
317 y
= _mm_mul_ps(y
, x
);
318 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p3
);
319 y
= _mm_mul_ps(y
, x
);
320 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p4
);
321 y
= _mm_mul_ps(y
, x
);
322 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p5
);
323 y
= _mm_mul_ps(y
, z
);
324 y
= _mm_add_ps(y
, x
);
325 y
= _mm_add_ps(y
, one
);
329 z
= _mm_movehl_ps(z
, fx
);
330 mm0
= _mm_cvttps_pi32(fx
);
331 mm1
= _mm_cvttps_pi32(z
);
332 mm0
= _mm_add_pi32(mm0
, *(v2si
*)_pi32_0x7f
);
333 mm1
= _mm_add_pi32(mm1
, *(v2si
*)_pi32_0x7f
);
334 mm0
= _mm_slli_pi32(mm0
, 23);
335 mm1
= _mm_slli_pi32(mm1
, 23);
337 COPY_MM_TO_XMM(mm0
, mm1
, pow2n
);
340 emm0
= _mm_cvttps_epi32(fx
);
341 emm0
= _mm_add_epi32(emm0
, *(v4si
*)_pi32_0x7f
);
342 emm0
= _mm_slli_epi32(emm0
, 23);
343 pow2n
= _mm_castsi128_ps(emm0
);
345 y
= _mm_mul_ps(y
, pow2n
);
349 _PS_CONST(minus_cephes_DP1
, -0.78515625);
350 _PS_CONST(minus_cephes_DP2
, -2.4187564849853515625e-4);
351 _PS_CONST(minus_cephes_DP3
, -3.77489497744594108e-8);
352 _PS_CONST(sincof_p0
, -1.9515295891E-4);
353 _PS_CONST(sincof_p1
, 8.3321608736E-3);
354 _PS_CONST(sincof_p2
, -1.6666654611E-1);
355 _PS_CONST(coscof_p0
, 2.443315711809948E-005);
356 _PS_CONST(coscof_p1
, -1.388731625493765E-003);
357 _PS_CONST(coscof_p2
, 4.166664568298827E-002);
358 _PS_CONST(cephes_FOPI
, 1.27323954473516); // 4 / M_PI
361 /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
362 it runs also on old athlons XPs and the pentium III of your grand
365 The code is the exact rewriting of the cephes sinf function.
366 Precision is excellent as long as x < 8192 (I did not bother to
367 take into account the special handling they have for greater values
368 -- it does not return garbage for arguments over 8192, though, but
369 the extra precision is missing).
371 Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
372 surprising but correct result.
374 Performance is also surprisingly good, 1.33 times faster than the
375 macos vsinf SSE2 function, and 1.5 times faster than the
376 __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
377 too bad for an SSE1 function (with no special tuning) !
378 However the latter libraries probably have a much better handling of NaN,
379 Inf, denormalized and other special arguments..
381 On my core 1 duo, the execution of this function takes approximately 95 cycles.
383 From what I have observed on the experiments with Intel AMath lib, switching to an
384 SSE2 version would improve the perf by only 10%.
386 Since it is based on SSE intrinsics, it has to be compiled at -O2 to
389 v4sf
sin_ps(v4sf x
) { // any x
390 v4sf xmm1
, xmm2
= _mm_setzero_ps(), xmm3
, sign_bit
, y
;
395 v2si mm0
, mm1
, mm2
, mm3
;
397 v4sf swap_sign_bit
, poly_mask
, z
, tmp
, y2
;
400 /* take the absolute value */
401 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_sign_mask
);
402 /* extract the sign bit (upper one) */
403 sign_bit
= _mm_and_ps(sign_bit
, *(v4sf
*)_ps_sign_mask
);
406 y
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_FOPI
);
408 //printf("plop:"); print4(y);
410 /* store the integer part of y in mm0 */
411 emm2
= _mm_cvttps_epi32(y
);
412 /* j=(j+1) & (~1) (see the cephes sources) */
413 emm2
= _mm_add_epi32(emm2
, *(v4si
*)_pi32_1
);
414 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_inv1
);
415 y
= _mm_cvtepi32_ps(emm2
);
416 /* get the swap sign flag */
417 emm0
= _mm_and_si128(emm2
, *(v4si
*)_pi32_4
);
418 emm0
= _mm_slli_epi32(emm0
, 29);
419 /* get the polynom selection mask
420 there is one polynom for 0 <= x <= Pi/4
421 and another one for Pi/4<x<=Pi/2
423 Both branches will be computed.
425 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_2
);
426 emm2
= _mm_cmpeq_epi32(emm2
, _mm_setzero_si128());
428 swap_sign_bit
= _mm_castsi128_ps(emm0
);
429 poly_mask
= _mm_castsi128_ps(emm2
);
430 sign_bit
= _mm_xor_ps(sign_bit
, swap_sign_bit
);
432 /* store the integer part of y in mm0:mm1 */
433 xmm2
= _mm_movehl_ps(xmm2
, y
);
434 mm2
= _mm_cvttps_pi32(y
);
435 mm3
= _mm_cvttps_pi32(xmm2
);
436 /* j=(j+1) & (~1) (see the cephes sources) */
437 mm2
= _mm_add_pi32(mm2
, *(v2si
*)_pi32_1
);
438 mm3
= _mm_add_pi32(mm3
, *(v2si
*)_pi32_1
);
439 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_inv1
);
440 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_inv1
);
441 y
= _mm_cvtpi32x2_ps(mm2
, mm3
);
442 /* get the swap sign flag */
443 mm0
= _mm_and_si64(mm2
, *(v2si
*)_pi32_4
);
444 mm1
= _mm_and_si64(mm3
, *(v2si
*)_pi32_4
);
445 mm0
= _mm_slli_pi32(mm0
, 29);
446 mm1
= _mm_slli_pi32(mm1
, 29);
447 /* get the polynom selection mask */
448 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_2
);
449 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_2
);
450 mm2
= _mm_cmpeq_pi32(mm2
, _mm_setzero_si64());
451 mm3
= _mm_cmpeq_pi32(mm3
, _mm_setzero_si64());
453 COPY_MM_TO_XMM(mm0
, mm1
, swap_sign_bit
);
454 COPY_MM_TO_XMM(mm2
, mm3
, poly_mask
);
455 sign_bit
= _mm_xor_ps(sign_bit
, swap_sign_bit
);
456 _mm_empty(); /* good-bye mmx */
459 /* The magic pass: "Extended precision modular arithmetic"
460 x = ((x - y * DP1) - y * DP2) - y * DP3; */
461 xmm1
= *(v4sf
*)_ps_minus_cephes_DP1
;
462 xmm2
= *(v4sf
*)_ps_minus_cephes_DP2
;
463 xmm3
= *(v4sf
*)_ps_minus_cephes_DP3
;
464 xmm1
= _mm_mul_ps(y
, xmm1
);
465 xmm2
= _mm_mul_ps(y
, xmm2
);
466 xmm3
= _mm_mul_ps(y
, xmm3
);
467 x
= _mm_add_ps(x
, xmm1
);
468 x
= _mm_add_ps(x
, xmm2
);
469 x
= _mm_add_ps(x
, xmm3
);
471 /* Evaluate the first polynom (0 <= x <= Pi/4) */
472 y
= *(v4sf
*)_ps_coscof_p0
;
475 y
= _mm_mul_ps(y
, z
);
476 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p1
);
477 y
= _mm_mul_ps(y
, z
);
478 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p2
);
479 y
= _mm_mul_ps(y
, z
);
480 y
= _mm_mul_ps(y
, z
);
481 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
482 y
= _mm_sub_ps(y
, tmp
);
483 y
= _mm_add_ps(y
, *(v4sf
*)_ps_1
);
485 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
487 y2
= *(v4sf
*)_ps_sincof_p0
;
488 y2
= _mm_mul_ps(y2
, z
);
489 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p1
);
490 y2
= _mm_mul_ps(y2
, z
);
491 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p2
);
492 y2
= _mm_mul_ps(y2
, z
);
493 y2
= _mm_mul_ps(y2
, x
);
494 y2
= _mm_add_ps(y2
, x
);
496 /* select the correct result from the two polynoms */
498 y2
= _mm_and_ps(xmm3
, y2
); //, xmm3);
499 y
= _mm_andnot_ps(xmm3
, y
);
500 y
= _mm_add_ps(y
,y2
);
501 /* update the sign */
502 y
= _mm_xor_ps(y
, sign_bit
);
507 /* almost the same as sin_ps */
508 v4sf
cos_ps(v4sf x
) { // any x
509 v4sf xmm1
, xmm2
= _mm_setzero_ps(), xmm3
, y
;
513 v2si mm0
, mm1
, mm2
, mm3
;
515 v4sf sign_bit
, poly_mask
, z
, tmp
, y2
;
517 /* take the absolute value */
518 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_sign_mask
);
521 y
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_FOPI
);
524 /* store the integer part of y in mm0 */
525 emm2
= _mm_cvttps_epi32(y
);
526 /* j=(j+1) & (~1) (see the cephes sources) */
527 emm2
= _mm_add_epi32(emm2
, *(v4si
*)_pi32_1
);
528 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_inv1
);
529 y
= _mm_cvtepi32_ps(emm2
);
531 emm2
= _mm_sub_epi32(emm2
, *(v4si
*)_pi32_2
);
533 /* get the swap sign flag */
534 emm0
= _mm_andnot_si128(emm2
, *(v4si
*)_pi32_4
);
535 emm0
= _mm_slli_epi32(emm0
, 29);
536 /* get the polynom selection mask */
537 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_2
);
538 emm2
= _mm_cmpeq_epi32(emm2
, _mm_setzero_si128());
540 sign_bit
= _mm_castsi128_ps(emm0
);
541 poly_mask
= _mm_castsi128_ps(emm2
);
543 /* store the integer part of y in mm0:mm1 */
544 xmm2
= _mm_movehl_ps(xmm2
, y
);
545 mm2
= _mm_cvttps_pi32(y
);
546 mm3
= _mm_cvttps_pi32(xmm2
);
548 /* j=(j+1) & (~1) (see the cephes sources) */
549 mm2
= _mm_add_pi32(mm2
, *(v2si
*)_pi32_1
);
550 mm3
= _mm_add_pi32(mm3
, *(v2si
*)_pi32_1
);
551 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_inv1
);
552 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_inv1
);
554 y
= _mm_cvtpi32x2_ps(mm2
, mm3
);
557 mm2
= _mm_sub_pi32(mm2
, *(v2si
*)_pi32_2
);
558 mm3
= _mm_sub_pi32(mm3
, *(v2si
*)_pi32_2
);
560 /* get the swap sign flag in mm0:mm1 and the
561 polynom selection mask in mm2:mm3 */
563 mm0
= _mm_andnot_si64(mm2
, *(v2si
*)_pi32_4
);
564 mm1
= _mm_andnot_si64(mm3
, *(v2si
*)_pi32_4
);
565 mm0
= _mm_slli_pi32(mm0
, 29);
566 mm1
= _mm_slli_pi32(mm1
, 29);
568 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_2
);
569 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_2
);
571 mm2
= _mm_cmpeq_pi32(mm2
, _mm_setzero_si64());
572 mm3
= _mm_cmpeq_pi32(mm3
, _mm_setzero_si64());
574 COPY_MM_TO_XMM(mm0
, mm1
, sign_bit
);
575 COPY_MM_TO_XMM(mm2
, mm3
, poly_mask
);
576 _mm_empty(); /* good-bye mmx */
578 /* The magic pass: "Extended precision modular arithmetic"
579 x = ((x - y * DP1) - y * DP2) - y * DP3; */
580 xmm1
= *(v4sf
*)_ps_minus_cephes_DP1
;
581 xmm2
= *(v4sf
*)_ps_minus_cephes_DP2
;
582 xmm3
= *(v4sf
*)_ps_minus_cephes_DP3
;
583 xmm1
= _mm_mul_ps(y
, xmm1
);
584 xmm2
= _mm_mul_ps(y
, xmm2
);
585 xmm3
= _mm_mul_ps(y
, xmm3
);
586 x
= _mm_add_ps(x
, xmm1
);
587 x
= _mm_add_ps(x
, xmm2
);
588 x
= _mm_add_ps(x
, xmm3
);
590 /* Evaluate the first polynom (0 <= x <= Pi/4) */
591 y
= *(v4sf
*)_ps_coscof_p0
;
594 y
= _mm_mul_ps(y
, z
);
595 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p1
);
596 y
= _mm_mul_ps(y
, z
);
597 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p2
);
598 y
= _mm_mul_ps(y
, z
);
599 y
= _mm_mul_ps(y
, z
);
600 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
601 y
= _mm_sub_ps(y
, tmp
);
602 y
= _mm_add_ps(y
, *(v4sf
*)_ps_1
);
604 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
606 y2
= *(v4sf
*)_ps_sincof_p0
;
607 y2
= _mm_mul_ps(y2
, z
);
608 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p1
);
609 y2
= _mm_mul_ps(y2
, z
);
610 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p2
);
611 y2
= _mm_mul_ps(y2
, z
);
612 y2
= _mm_mul_ps(y2
, x
);
613 y2
= _mm_add_ps(y2
, x
);
615 /* select the correct result from the two polynoms */
617 y2
= _mm_and_ps(xmm3
, y2
); //, xmm3);
618 y
= _mm_andnot_ps(xmm3
, y
);
619 y
= _mm_add_ps(y
,y2
);
620 /* update the sign */
621 y
= _mm_xor_ps(y
, sign_bit
);
626 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
627 it is almost as fast, and gives you a free cosine with your sine */
628 void sincos_ps(v4sf x
, v4sf
*s
, v4sf
*c
) {
629 v4sf xmm1
, xmm2
, xmm3
= _mm_setzero_ps(), sign_bit_sin
, y
;
631 v4si emm0
, emm2
, emm4
;
633 v2si mm0
, mm1
, mm2
, mm3
, mm4
, mm5
;
635 v4sf swap_sign_bit_sin
, poly_mask
, z
, tmp
, y2
, ysin1
, ysin2
;
639 /* take the absolute value */
640 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_sign_mask
);
641 /* extract the sign bit (upper one) */
642 sign_bit_sin
= _mm_and_ps(sign_bit_sin
, *(v4sf
*)_ps_sign_mask
);
645 y
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_FOPI
);
648 /* store the integer part of y in emm2 */
649 emm2
= _mm_cvttps_epi32(y
);
651 /* j=(j+1) & (~1) (see the cephes sources) */
652 emm2
= _mm_add_epi32(emm2
, *(v4si
*)_pi32_1
);
653 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_inv1
);
654 y
= _mm_cvtepi32_ps(emm2
);
658 /* get the swap sign flag for the sine */
659 emm0
= _mm_and_si128(emm2
, *(v4si
*)_pi32_4
);
660 emm0
= _mm_slli_epi32(emm0
, 29);
661 swap_sign_bit_sin
= _mm_castsi128_ps(emm0
);
663 /* get the polynom selection mask for the sine*/
664 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_2
);
665 emm2
= _mm_cmpeq_epi32(emm2
, _mm_setzero_si128());
666 poly_mask
= _mm_castsi128_ps(emm2
);
668 /* store the integer part of y in mm2:mm3 */
669 xmm3
= _mm_movehl_ps(xmm3
, y
);
670 mm2
= _mm_cvttps_pi32(y
);
671 mm3
= _mm_cvttps_pi32(xmm3
);
673 /* j=(j+1) & (~1) (see the cephes sources) */
674 mm2
= _mm_add_pi32(mm2
, *(v2si
*)_pi32_1
);
675 mm3
= _mm_add_pi32(mm3
, *(v2si
*)_pi32_1
);
676 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_inv1
);
677 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_inv1
);
679 y
= _mm_cvtpi32x2_ps(mm2
, mm3
);
684 /* get the swap sign flag for the sine */
685 mm0
= _mm_and_si64(mm2
, *(v2si
*)_pi32_4
);
686 mm1
= _mm_and_si64(mm3
, *(v2si
*)_pi32_4
);
687 mm0
= _mm_slli_pi32(mm0
, 29);
688 mm1
= _mm_slli_pi32(mm1
, 29);
690 COPY_MM_TO_XMM(mm0
, mm1
, swap_sign_bit_sin
);
692 /* get the polynom selection mask for the sine */
694 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_2
);
695 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_2
);
696 mm2
= _mm_cmpeq_pi32(mm2
, _mm_setzero_si64());
697 mm3
= _mm_cmpeq_pi32(mm3
, _mm_setzero_si64());
699 COPY_MM_TO_XMM(mm2
, mm3
, poly_mask
);
702 /* The magic pass: "Extended precision modular arithmetic"
703 x = ((x - y * DP1) - y * DP2) - y * DP3; */
704 xmm1
= *(v4sf
*)_ps_minus_cephes_DP1
;
705 xmm2
= *(v4sf
*)_ps_minus_cephes_DP2
;
706 xmm3
= *(v4sf
*)_ps_minus_cephes_DP3
;
707 xmm1
= _mm_mul_ps(y
, xmm1
);
708 xmm2
= _mm_mul_ps(y
, xmm2
);
709 xmm3
= _mm_mul_ps(y
, xmm3
);
710 x
= _mm_add_ps(x
, xmm1
);
711 x
= _mm_add_ps(x
, xmm2
);
712 x
= _mm_add_ps(x
, xmm3
);
715 emm4
= _mm_sub_epi32(emm4
, *(v4si
*)_pi32_2
);
716 emm4
= _mm_andnot_si128(emm4
, *(v4si
*)_pi32_4
);
717 emm4
= _mm_slli_epi32(emm4
, 29);
718 sign_bit_cos
= _mm_castsi128_ps(emm4
);
720 /* get the sign flag for the cosine */
721 mm4
= _mm_sub_pi32(mm4
, *(v2si
*)_pi32_2
);
722 mm5
= _mm_sub_pi32(mm5
, *(v2si
*)_pi32_2
);
723 mm4
= _mm_andnot_si64(mm4
, *(v2si
*)_pi32_4
);
724 mm5
= _mm_andnot_si64(mm5
, *(v2si
*)_pi32_4
);
725 mm4
= _mm_slli_pi32(mm4
, 29);
726 mm5
= _mm_slli_pi32(mm5
, 29);
727 COPY_MM_TO_XMM(mm4
, mm5
, sign_bit_cos
);
728 _mm_empty(); /* good-bye mmx */
731 sign_bit_sin
= _mm_xor_ps(sign_bit_sin
, swap_sign_bit_sin
);
734 /* Evaluate the first polynom (0 <= x <= Pi/4) */
736 y
= *(v4sf
*)_ps_coscof_p0
;
738 y
= _mm_mul_ps(y
, z
);
739 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p1
);
740 y
= _mm_mul_ps(y
, z
);
741 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p2
);
742 y
= _mm_mul_ps(y
, z
);
743 y
= _mm_mul_ps(y
, z
);
744 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
745 y
= _mm_sub_ps(y
, tmp
);
746 y
= _mm_add_ps(y
, *(v4sf
*)_ps_1
);
748 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
750 y2
= *(v4sf
*)_ps_sincof_p0
;
751 y2
= _mm_mul_ps(y2
, z
);
752 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p1
);
753 y2
= _mm_mul_ps(y2
, z
);
754 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p2
);
755 y2
= _mm_mul_ps(y2
, z
);
756 y2
= _mm_mul_ps(y2
, x
);
757 y2
= _mm_add_ps(y2
, x
);
759 /* select the correct result from the two polynoms */
761 ysin2
= _mm_and_ps(xmm3
, y2
);
762 ysin1
= _mm_andnot_ps(xmm3
, y
);
763 y2
= _mm_sub_ps(y2
,ysin2
);
764 y
= _mm_sub_ps(y
, ysin1
);
766 xmm1
= _mm_add_ps(ysin1
,ysin2
);
767 xmm2
= _mm_add_ps(y
,y2
);
769 /* update the sign */
770 *s
= _mm_xor_ps(xmm1
, sign_bit_sin
);
771 *c
= _mm_xor_ps(xmm2
, sign_bit_cos
);