/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2 the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007  Julien Pommier

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif
54 /* declare some SSE constants -- why can't I figure a better way to do that? */
55 #define _PS_CONST(Name, Val) \
56 static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
57 #define _PI32_CONST(Name, Val) \
58 static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
59 #define _PS_CONST_TYPE(Name, Type, Val) \
60 static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
64 /* the smallest non denormalized float number */
65 _PS_CONST_TYPE(min_norm_pos
, int, 0x00800000);
66 _PS_CONST_TYPE(mant_mask
, int, 0x7f800000);
67 _PS_CONST_TYPE(inv_mant_mask
, int, ~0x7f800000);
69 _PS_CONST_TYPE(sign_mask
, int, 0x80000000);
70 _PS_CONST_TYPE(inv_sign_mask
, int, ~0x80000000);
73 _PI32_CONST(inv1
, ~1);
76 _PI32_CONST(0x7f, 0x7f);
78 _PS_CONST(cephes_SQRTHF
, 0.707106781186547524);
79 _PS_CONST(cephes_log_p0
, 7.0376836292E-2);
80 _PS_CONST(cephes_log_p1
, - 1.1514610310E-1);
81 _PS_CONST(cephes_log_p2
, 1.1676998740E-1);
82 _PS_CONST(cephes_log_p3
, - 1.2420140846E-1);
83 _PS_CONST(cephes_log_p4
, + 1.4249322787E-1);
84 _PS_CONST(cephes_log_p5
, - 1.6668057665E-1);
85 _PS_CONST(cephes_log_p6
, + 2.0000714765E-1);
86 _PS_CONST(cephes_log_p7
, - 2.4999993993E-1);
87 _PS_CONST(cephes_log_p8
, + 3.3333331174E-1);
88 _PS_CONST(cephes_log_q1
, -2.12194440e-4);
89 _PS_CONST(cephes_log_q2
, 0.693359375);
95 void sincos_ps(v4sf x
, v4sf
*s
, v4sf
*c
);
#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...

   Note that the bug on _mm_cmp* does occur only at -O0 optimization level
*/

/* hand-written replacement for the broken _mm_movehl_ps intrinsic */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
	asm (
			"movhlps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

/* hand-written replacements for the broken _mm_cmp* intrinsics */
inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
	asm (
			"cmpltps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
	asm (
			"cmpnleps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
	asm (
			"cmpeqps %2,%0\n\t"
			: "=x" (a)
			: "0" (a), "x"(b)
	    );
	return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif
#ifndef USE_SSE2
/* reinterpret one __m128 as a pair of __m64 (and back) without memory traffic */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
    xmm_mm_union u; u.xmm = xmm_;                   \
    mm0_ = u.mm[0];                                 \
    mm1_ = u.mm[1];                                 \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
  }
#endif // USE_SSE2
164 /* natural logarithm computed for 4 simultaneous float
165 return NaN for x <= 0
167 v4sf
log_ps(v4sf x
) {
173 v4sf one
= *(v4sf
*)_ps_1
;
175 v4sf invalid_mask
= _mm_cmple_ps(x
, _mm_setzero_ps());
176 v4sf e
, mask
, tmp
, z
, y
;
178 x
= _mm_max_ps(x
, *(v4sf
*)_ps_min_norm_pos
); /* cut off denormalized stuff */
181 /* part 1: x = frexpf(x, &e); */
182 COPY_XMM_TO_MM(x
, mm0
, mm1
);
183 mm0
= _mm_srli_pi32(mm0
, 23);
184 mm1
= _mm_srli_pi32(mm1
, 23);
186 emm0
= _mm_srli_epi32(_mm_castps_si128(x
), 23);
188 /* keep only the fractional part */
189 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_mant_mask
);
190 x
= _mm_or_ps(x
, *(v4sf
*)_ps_0p5
);
193 /* now e=mm0:mm1 contain the really base-2 exponent */
194 mm0
= _mm_sub_pi32(mm0
, *(v2si
*)_pi32_0x7f
);
195 mm1
= _mm_sub_pi32(mm1
, *(v2si
*)_pi32_0x7f
);
196 e
= _mm_cvtpi32x2_ps(mm0
, mm1
);
197 _mm_empty(); /* bye bye mmx */
199 emm0
= _mm_sub_epi32(emm0
, *(v4si
*)_pi32_0x7f
);
200 e
= _mm_cvtepi32_ps(emm0
);
203 e
= _mm_add_ps(e
, one
);
209 } else { x = x - 1.0; }
212 mask
= _mm_cmplt_ps(x
, *(v4sf
*)_ps_cephes_SQRTHF
);
213 tmp
= _mm_and_ps(x
, mask
);
214 x
= _mm_sub_ps(x
, one
);
215 e
= _mm_sub_ps(e
, _mm_and_ps(one
, mask
));
216 x
= _mm_add_ps(x
, tmp
);
221 y
= *(v4sf
*)_ps_cephes_log_p0
;
222 y
= _mm_mul_ps(y
, x
);
223 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p1
);
224 y
= _mm_mul_ps(y
, x
);
225 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p2
);
226 y
= _mm_mul_ps(y
, x
);
227 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p3
);
228 y
= _mm_mul_ps(y
, x
);
229 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p4
);
230 y
= _mm_mul_ps(y
, x
);
231 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p5
);
232 y
= _mm_mul_ps(y
, x
);
233 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p6
);
234 y
= _mm_mul_ps(y
, x
);
235 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p7
);
236 y
= _mm_mul_ps(y
, x
);
237 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_log_p8
);
238 y
= _mm_mul_ps(y
, x
);
240 y
= _mm_mul_ps(y
, z
);
243 tmp
= _mm_mul_ps(e
, *(v4sf
*)_ps_cephes_log_q1
);
244 y
= _mm_add_ps(y
, tmp
);
247 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
248 y
= _mm_sub_ps(y
, tmp
);
250 tmp
= _mm_mul_ps(e
, *(v4sf
*)_ps_cephes_log_q2
);
251 x
= _mm_add_ps(x
, y
);
252 x
= _mm_add_ps(x
, tmp
);
253 x
= _mm_or_ps(x
, invalid_mask
); // negative arg will be NAN
257 _PS_CONST(exp_hi
, 88.3762626647949f
);
258 _PS_CONST(exp_lo
, -88.3762626647949f
);
260 _PS_CONST(cephes_LOG2EF
, 1.44269504088896341);
261 _PS_CONST(cephes_exp_C1
, 0.693359375);
262 _PS_CONST(cephes_exp_C2
, -2.12194440e-4);
264 _PS_CONST(cephes_exp_p0
, 1.9875691500E-4);
265 _PS_CONST(cephes_exp_p1
, 1.3981999507E-3);
266 _PS_CONST(cephes_exp_p2
, 8.3334519073E-3);
267 _PS_CONST(cephes_exp_p3
, 4.1665795894E-2);
268 _PS_CONST(cephes_exp_p4
, 1.6666665459E-1);
269 _PS_CONST(cephes_exp_p5
, 5.0000001201E-1);
271 v4sf
exp_ps(v4sf x
) {
272 v4sf tmp
= _mm_setzero_ps(), fx
;
278 v4sf one
= *(v4sf
*)_ps_1
;
279 v4sf mask
, z
, y
, pow2n
;
281 x
= _mm_min_ps(x
, *(v4sf
*)_ps_exp_hi
);
282 x
= _mm_max_ps(x
, *(v4sf
*)_ps_exp_lo
);
284 /* express exp(x) as exp(g + n*log(2)) */
285 fx
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_LOG2EF
);
286 fx
= _mm_add_ps(fx
, *(v4sf
*)_ps_0p5
);
288 /* how to perform a floorf with SSE: just below */
290 /* step 1 : cast to int */
291 tmp
= _mm_movehl_ps(tmp
, fx
);
292 mm0
= _mm_cvttps_pi32(fx
);
293 mm1
= _mm_cvttps_pi32(tmp
);
294 /* step 2 : cast back to float */
295 tmp
= _mm_cvtpi32x2_ps(mm0
, mm1
);
297 emm0
= _mm_cvttps_epi32(fx
);
298 tmp
= _mm_cvtepi32_ps(emm0
);
300 /* if greater, substract 1 */
301 mask
= _mm_cmpgt_ps(tmp
, fx
);
302 mask
= _mm_and_ps(mask
, one
);
303 fx
= _mm_sub_ps(tmp
, mask
);
305 tmp
= _mm_mul_ps(fx
, *(v4sf
*)_ps_cephes_exp_C1
);
306 z
= _mm_mul_ps(fx
, *(v4sf
*)_ps_cephes_exp_C2
);
307 x
= _mm_sub_ps(x
, tmp
);
308 x
= _mm_sub_ps(x
, z
);
312 y
= *(v4sf
*)_ps_cephes_exp_p0
;
313 y
= _mm_mul_ps(y
, x
);
314 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p1
);
315 y
= _mm_mul_ps(y
, x
);
316 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p2
);
317 y
= _mm_mul_ps(y
, x
);
318 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p3
);
319 y
= _mm_mul_ps(y
, x
);
320 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p4
);
321 y
= _mm_mul_ps(y
, x
);
322 y
= _mm_add_ps(y
, *(v4sf
*)_ps_cephes_exp_p5
);
323 y
= _mm_mul_ps(y
, z
);
324 y
= _mm_add_ps(y
, x
);
325 y
= _mm_add_ps(y
, one
);
329 z
= _mm_movehl_ps(z
, fx
);
330 mm0
= _mm_cvttps_pi32(fx
);
331 mm1
= _mm_cvttps_pi32(z
);
332 mm0
= _mm_add_pi32(mm0
, *(v2si
*)_pi32_0x7f
);
333 mm1
= _mm_add_pi32(mm1
, *(v2si
*)_pi32_0x7f
);
334 mm0
= _mm_slli_pi32(mm0
, 23);
335 mm1
= _mm_slli_pi32(mm1
, 23);
337 COPY_MM_TO_XMM(mm0
, mm1
, pow2n
);
340 emm0
= _mm_cvttps_epi32(fx
);
341 emm0
= _mm_add_epi32(emm0
, *(v4si
*)_pi32_0x7f
);
342 emm0
= _mm_slli_epi32(emm0
, 23);
343 pow2n
= _mm_castsi128_ps(emm0
);
345 y
= _mm_mul_ps(y
, pow2n
);
349 _PS_CONST(minus_cephes_DP1
, -0.78515625);
350 _PS_CONST(minus_cephes_DP2
, -2.4187564849853515625e-4);
351 _PS_CONST(minus_cephes_DP3
, -3.77489497744594108e-8);
352 _PS_CONST(sincof_p0
, -1.9515295891E-4);
353 _PS_CONST(sincof_p1
, 8.3321608736E-3);
354 _PS_CONST(sincof_p2
, -1.6666654611E-1);
355 _PS_CONST(coscof_p0
, 2.443315711809948E-005);
356 _PS_CONST(coscof_p1
, -1.388731625493765E-003);
357 _PS_CONST(coscof_p2
, 4.166664568298827E-002);
358 _PS_CONST(cephes_FOPI
, 1.27323954473516); // 4 / M_PI
361 /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
362 it runs also on old athlons XPs and the pentium III of your grand
365 The code is the exact rewriting of the cephes sinf function.
366 Precision is excellent as long as x < 8192 (I did not bother to
367 take into account the special handling they have for greater values
368 -- it does not return garbage for arguments over 8192, though, but
369 the extra precision is missing).
371 Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
372 surprising but correct result.
374 Performance is also surprisingly good, 1.33 times faster than the
375 macos vsinf SSE2 function, and 1.5 times faster than the
376 __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
377 too bad for an SSE1 function (with no special tuning) !
378 However the latter libraries probably have a much better handling of NaN,
379 Inf, denormalized and other special arguments..
381 On my core 1 duo, the execution of this function takes approximately 95 cycles.
383 From what I have observed on the experiments with Intel AMath lib, switching to an
384 SSE2 version would improve the perf by only 10%.
386 Since it is based on SSE intrinsics, it has to be compiled at -O2 to
389 v4sf
sin_ps(v4sf x
) { // any x
390 v4sf xmm1
, xmm2
= _mm_setzero_ps(), xmm3
, sign_bit
, y
;
395 v2si mm0
, mm1
, mm2
, mm3
;
397 v4sf swap_sign_bit
, poly_mask
, z
, tmp
, y2
;
400 /* take the absolute value */
401 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_sign_mask
);
402 /* extract the sign bit (upper one) */
403 sign_bit
= _mm_and_ps(sign_bit
, *(v4sf
*)_ps_sign_mask
);
406 y
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_FOPI
);
408 //printf("plop:"); print4(y);
410 /* store the integer part of y in mm0 */
411 emm2
= _mm_cvttps_epi32(y
);
412 /* j=(j+1) & (~1) (see the cephes sources) */
413 emm2
= _mm_add_epi32(emm2
, *(v4si
*)_pi32_1
);
414 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_inv1
);
415 y
= _mm_cvtepi32_ps(emm2
);
416 /* get the swap sign flag */
417 emm0
= _mm_and_si128(emm2
, *(v4si
*)_pi32_4
);
418 emm0
= _mm_slli_epi32(emm0
, 29);
419 /* get the polynom selection mask
420 there is one polynom for 0 <= x <= Pi/4
421 and another one for Pi/4<x<=Pi/2
423 Both branches will be computed.
425 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_2
);
426 emm2
= _mm_cmpeq_epi32(emm2
, _mm_setzero_si128());
428 swap_sign_bit
= _mm_castsi128_ps(emm0
);
429 poly_mask
= _mm_castsi128_ps(emm2
);
430 sign_bit
= _mm_xor_ps(sign_bit
, swap_sign_bit
);
432 /* store the integer part of y in mm0:mm1 */
433 xmm2
= _mm_movehl_ps(xmm2
, y
);
434 mm2
= _mm_cvttps_pi32(y
);
435 mm3
= _mm_cvttps_pi32(xmm2
);
436 /* j=(j+1) & (~1) (see the cephes sources) */
437 mm2
= _mm_add_pi32(mm2
, *(v2si
*)_pi32_1
);
438 mm3
= _mm_add_pi32(mm3
, *(v2si
*)_pi32_1
);
439 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_inv1
);
440 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_inv1
);
441 y
= _mm_cvtpi32x2_ps(mm2
, mm3
);
442 /* get the swap sign flag */
443 mm0
= _mm_and_si64(mm2
, *(v2si
*)_pi32_4
);
444 mm1
= _mm_and_si64(mm3
, *(v2si
*)_pi32_4
);
445 mm0
= _mm_slli_pi32(mm0
, 29);
446 mm1
= _mm_slli_pi32(mm1
, 29);
447 /* get the polynom selection mask */
448 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_2
);
449 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_2
);
450 mm2
= _mm_cmpeq_pi32(mm2
, _mm_setzero_si64());
451 mm3
= _mm_cmpeq_pi32(mm3
, _mm_setzero_si64());
453 COPY_MM_TO_XMM(mm0
, mm1
, swap_sign_bit
);
454 COPY_MM_TO_XMM(mm2
, mm3
, poly_mask
);
455 sign_bit
= _mm_xor_ps(sign_bit
, swap_sign_bit
);
456 _mm_empty(); /* good-bye mmx */
459 /* The magic pass: "Extended precision modular arithmetic"
460 x = ((x - y * DP1) - y * DP2) - y * DP3; */
461 xmm1
= *(v4sf
*)_ps_minus_cephes_DP1
;
462 xmm2
= *(v4sf
*)_ps_minus_cephes_DP2
;
463 xmm3
= *(v4sf
*)_ps_minus_cephes_DP3
;
464 xmm1
= _mm_mul_ps(y
, xmm1
);
465 xmm2
= _mm_mul_ps(y
, xmm2
);
466 xmm3
= _mm_mul_ps(y
, xmm3
);
467 x
= _mm_add_ps(x
, xmm1
);
468 x
= _mm_add_ps(x
, xmm2
);
469 x
= _mm_add_ps(x
, xmm3
);
471 /* Evaluate the first polynom (0 <= x <= Pi/4) */
472 y
= *(v4sf
*)_ps_coscof_p0
;
475 y
= _mm_mul_ps(y
, z
);
476 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p1
);
477 y
= _mm_mul_ps(y
, z
);
478 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p2
);
479 y
= _mm_mul_ps(y
, z
);
480 y
= _mm_mul_ps(y
, z
);
481 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
482 y
= _mm_sub_ps(y
, tmp
);
483 y
= _mm_add_ps(y
, *(v4sf
*)_ps_1
);
485 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
487 y2
= *(v4sf
*)_ps_sincof_p0
;
488 y2
= _mm_mul_ps(y2
, z
);
489 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p1
);
490 y2
= _mm_mul_ps(y2
, z
);
491 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p2
);
492 y2
= _mm_mul_ps(y2
, z
);
493 y2
= _mm_mul_ps(y2
, x
);
494 y2
= _mm_add_ps(y2
, x
);
496 /* select the correct result from the two polynoms */
498 y2
= _mm_and_ps(xmm3
, y2
); //, xmm3);
499 y
= _mm_andnot_ps(xmm3
, y
);
500 y
= _mm_add_ps(y
,y2
);
501 /* update the sign */
502 y
= _mm_xor_ps(y
, sign_bit
);
507 /* almost the same as sin_ps */
508 v4sf
cos_ps(v4sf x
) { // any x
509 v4sf xmm1
, xmm2
= _mm_setzero_ps(), xmm3
, y
;
513 v2si mm0
, mm1
, mm2
, mm3
;
515 v4sf sign_bit
, poly_mask
, z
, tmp
, y2
;
517 /* take the absolute value */
518 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_sign_mask
);
521 y
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_FOPI
);
524 /* store the integer part of y in mm0 */
525 emm2
= _mm_cvttps_epi32(y
);
526 /* j=(j+1) & (~1) (see the cephes sources) */
527 emm2
= _mm_add_epi32(emm2
, *(v4si
*)_pi32_1
);
528 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_inv1
);
529 y
= _mm_cvtepi32_ps(emm2
);
531 emm2
= _mm_sub_epi32(emm2
, *(v4si
*)_pi32_2
);
533 /* get the swap sign flag */
534 emm0
= _mm_andnot_si128(emm2
, *(v4si
*)_pi32_4
);
535 emm0
= _mm_slli_epi32(emm0
, 29);
536 /* get the polynom selection mask */
537 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_2
);
538 emm2
= _mm_cmpeq_epi32(emm2
, _mm_setzero_si128());
540 sign_bit
= _mm_castsi128_ps(emm0
);
541 poly_mask
= _mm_castsi128_ps(emm2
);
543 /* store the integer part of y in mm0:mm1 */
544 xmm2
= _mm_movehl_ps(xmm2
, y
);
545 mm2
= _mm_cvttps_pi32(y
);
546 mm3
= _mm_cvttps_pi32(xmm2
);
548 /* j=(j+1) & (~1) (see the cephes sources) */
549 mm2
= _mm_add_pi32(mm2
, *(v2si
*)_pi32_1
);
550 mm3
= _mm_add_pi32(mm3
, *(v2si
*)_pi32_1
);
551 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_inv1
);
552 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_inv1
);
554 y
= _mm_cvtpi32x2_ps(mm2
, mm3
);
557 mm2
= _mm_sub_pi32(mm2
, *(v2si
*)_pi32_2
);
558 mm3
= _mm_sub_pi32(mm3
, *(v2si
*)_pi32_2
);
560 /* get the swap sign flag in mm0:mm1 and the
561 polynom selection mask in mm2:mm3 */
563 mm0
= _mm_andnot_si64(mm2
, *(v2si
*)_pi32_4
);
564 mm1
= _mm_andnot_si64(mm3
, *(v2si
*)_pi32_4
);
565 mm0
= _mm_slli_pi32(mm0
, 29);
566 mm1
= _mm_slli_pi32(mm1
, 29);
568 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_2
);
569 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_2
);
571 mm2
= _mm_cmpeq_pi32(mm2
, _mm_setzero_si64());
572 mm3
= _mm_cmpeq_pi32(mm3
, _mm_setzero_si64());
574 COPY_MM_TO_XMM(mm0
, mm1
, sign_bit
);
575 COPY_MM_TO_XMM(mm2
, mm3
, poly_mask
);
576 _mm_empty(); /* good-bye mmx */
578 /* The magic pass: "Extended precision modular arithmetic"
579 x = ((x - y * DP1) - y * DP2) - y * DP3; */
580 xmm1
= *(v4sf
*)_ps_minus_cephes_DP1
;
581 xmm2
= *(v4sf
*)_ps_minus_cephes_DP2
;
582 xmm3
= *(v4sf
*)_ps_minus_cephes_DP3
;
583 xmm1
= _mm_mul_ps(y
, xmm1
);
584 xmm2
= _mm_mul_ps(y
, xmm2
);
585 xmm3
= _mm_mul_ps(y
, xmm3
);
586 x
= _mm_add_ps(x
, xmm1
);
587 x
= _mm_add_ps(x
, xmm2
);
588 x
= _mm_add_ps(x
, xmm3
);
590 /* Evaluate the first polynom (0 <= x <= Pi/4) */
591 y
= *(v4sf
*)_ps_coscof_p0
;
594 y
= _mm_mul_ps(y
, z
);
595 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p1
);
596 y
= _mm_mul_ps(y
, z
);
597 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p2
);
598 y
= _mm_mul_ps(y
, z
);
599 y
= _mm_mul_ps(y
, z
);
600 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
601 y
= _mm_sub_ps(y
, tmp
);
602 y
= _mm_add_ps(y
, *(v4sf
*)_ps_1
);
604 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
606 y2
= *(v4sf
*)_ps_sincof_p0
;
607 y2
= _mm_mul_ps(y2
, z
);
608 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p1
);
609 y2
= _mm_mul_ps(y2
, z
);
610 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p2
);
611 y2
= _mm_mul_ps(y2
, z
);
612 y2
= _mm_mul_ps(y2
, x
);
613 y2
= _mm_add_ps(y2
, x
);
615 /* select the correct result from the two polynoms */
617 y2
= _mm_and_ps(xmm3
, y2
); //, xmm3);
618 y
= _mm_andnot_ps(xmm3
, y
);
619 y
= _mm_add_ps(y
,y2
);
620 /* update the sign */
621 y
= _mm_xor_ps(y
, sign_bit
);
626 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
627 it is almost as fast, and gives you a free cosine with your sine */
628 void sincos_ps(v4sf x
, v4sf
*s
, v4sf
*c
) {
629 v4sf xmm1
, xmm2
, xmm3
= _mm_setzero_ps(), sign_bit_sin
, y
;
631 v4si emm0
, emm2
, emm4
;
633 v2si mm0
, mm1
, mm2
, mm3
, mm4
, mm5
;
635 v4sf swap_sign_bit_sin
, poly_mask
, z
, tmp
, y2
, ysin1
, ysin2
;
639 /* take the absolute value */
640 x
= _mm_and_ps(x
, *(v4sf
*)_ps_inv_sign_mask
);
641 /* extract the sign bit (upper one) */
642 sign_bit_sin
= _mm_and_ps(sign_bit_sin
, *(v4sf
*)_ps_sign_mask
);
645 y
= _mm_mul_ps(x
, *(v4sf
*)_ps_cephes_FOPI
);
648 /* store the integer part of y in emm2 */
649 emm2
= _mm_cvttps_epi32(y
);
651 /* j=(j+1) & (~1) (see the cephes sources) */
652 emm2
= _mm_add_epi32(emm2
, *(v4si
*)_pi32_1
);
653 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_inv1
);
654 y
= _mm_cvtepi32_ps(emm2
);
658 /* get the swap sign flag for the sine */
659 emm0
= _mm_and_si128(emm2
, *(v4si
*)_pi32_4
);
660 emm0
= _mm_slli_epi32(emm0
, 29);
661 swap_sign_bit_sin
= _mm_castsi128_ps(emm0
);
663 /* get the polynom selection mask for the sine*/
664 emm2
= _mm_and_si128(emm2
, *(v4si
*)_pi32_2
);
665 emm2
= _mm_cmpeq_epi32(emm2
, _mm_setzero_si128());
666 poly_mask
= _mm_castsi128_ps(emm2
);
668 /* store the integer part of y in mm2:mm3 */
669 xmm3
= _mm_movehl_ps(xmm3
, y
);
670 mm2
= _mm_cvttps_pi32(y
);
671 mm3
= _mm_cvttps_pi32(xmm3
);
673 /* j=(j+1) & (~1) (see the cephes sources) */
674 mm2
= _mm_add_pi32(mm2
, *(v2si
*)_pi32_1
);
675 mm3
= _mm_add_pi32(mm3
, *(v2si
*)_pi32_1
);
676 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_inv1
);
677 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_inv1
);
679 y
= _mm_cvtpi32x2_ps(mm2
, mm3
);
684 /* get the swap sign flag for the sine */
685 mm0
= _mm_and_si64(mm2
, *(v2si
*)_pi32_4
);
686 mm1
= _mm_and_si64(mm3
, *(v2si
*)_pi32_4
);
687 mm0
= _mm_slli_pi32(mm0
, 29);
688 mm1
= _mm_slli_pi32(mm1
, 29);
690 COPY_MM_TO_XMM(mm0
, mm1
, swap_sign_bit_sin
);
692 /* get the polynom selection mask for the sine */
694 mm2
= _mm_and_si64(mm2
, *(v2si
*)_pi32_2
);
695 mm3
= _mm_and_si64(mm3
, *(v2si
*)_pi32_2
);
696 mm2
= _mm_cmpeq_pi32(mm2
, _mm_setzero_si64());
697 mm3
= _mm_cmpeq_pi32(mm3
, _mm_setzero_si64());
699 COPY_MM_TO_XMM(mm2
, mm3
, poly_mask
);
702 /* The magic pass: "Extended precision modular arithmetic"
703 x = ((x - y * DP1) - y * DP2) - y * DP3; */
704 xmm1
= *(v4sf
*)_ps_minus_cephes_DP1
;
705 xmm2
= *(v4sf
*)_ps_minus_cephes_DP2
;
706 xmm3
= *(v4sf
*)_ps_minus_cephes_DP3
;
707 xmm1
= _mm_mul_ps(y
, xmm1
);
708 xmm2
= _mm_mul_ps(y
, xmm2
);
709 xmm3
= _mm_mul_ps(y
, xmm3
);
710 x
= _mm_add_ps(x
, xmm1
);
711 x
= _mm_add_ps(x
, xmm2
);
712 x
= _mm_add_ps(x
, xmm3
);
715 emm4
= _mm_sub_epi32(emm4
, *(v4si
*)_pi32_2
);
716 emm4
= _mm_andnot_si128(emm4
, *(v4si
*)_pi32_4
);
717 emm4
= _mm_slli_epi32(emm4
, 29);
718 sign_bit_cos
= _mm_castsi128_ps(emm4
);
720 /* get the sign flag for the cosine */
721 mm4
= _mm_sub_pi32(mm4
, *(v2si
*)_pi32_2
);
722 mm5
= _mm_sub_pi32(mm5
, *(v2si
*)_pi32_2
);
723 mm4
= _mm_andnot_si64(mm4
, *(v2si
*)_pi32_4
);
724 mm5
= _mm_andnot_si64(mm5
, *(v2si
*)_pi32_4
);
725 mm4
= _mm_slli_pi32(mm4
, 29);
726 mm5
= _mm_slli_pi32(mm5
, 29);
727 COPY_MM_TO_XMM(mm4
, mm5
, sign_bit_cos
);
728 _mm_empty(); /* good-bye mmx */
731 sign_bit_sin
= _mm_xor_ps(sign_bit_sin
, swap_sign_bit_sin
);
734 /* Evaluate the first polynom (0 <= x <= Pi/4) */
736 y
= *(v4sf
*)_ps_coscof_p0
;
738 y
= _mm_mul_ps(y
, z
);
739 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p1
);
740 y
= _mm_mul_ps(y
, z
);
741 y
= _mm_add_ps(y
, *(v4sf
*)_ps_coscof_p2
);
742 y
= _mm_mul_ps(y
, z
);
743 y
= _mm_mul_ps(y
, z
);
744 tmp
= _mm_mul_ps(z
, *(v4sf
*)_ps_0p5
);
745 y
= _mm_sub_ps(y
, tmp
);
746 y
= _mm_add_ps(y
, *(v4sf
*)_ps_1
);
748 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
750 y2
= *(v4sf
*)_ps_sincof_p0
;
751 y2
= _mm_mul_ps(y2
, z
);
752 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p1
);
753 y2
= _mm_mul_ps(y2
, z
);
754 y2
= _mm_add_ps(y2
, *(v4sf
*)_ps_sincof_p2
);
755 y2
= _mm_mul_ps(y2
, z
);
756 y2
= _mm_mul_ps(y2
, x
);
757 y2
= _mm_add_ps(y2
, x
);
759 /* select the correct result from the two polynoms */
761 ysin2
= _mm_and_ps(xmm3
, y2
);
762 ysin1
= _mm_andnot_ps(xmm3
, y
);
763 y2
= _mm_sub_ps(y2
,ysin2
);
764 y
= _mm_sub_ps(y
, ysin1
);
766 xmm1
= _mm_add_ps(ysin1
,ysin2
);
767 xmm2
= _mm_add_ps(y
,y2
);
769 /* update the sign */
770 *s
= _mm_xor_ps(xmm1
, sign_bit_sin
);
771 *c
= _mm_xor_ps(xmm2
, sign_bit_cos
);