llvmpipe: Remove some broken MinGW hacks in the sin/cos reference code.
mesa.git: src/gallium/drivers/llvmpipe/sse_mathfun.h
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library

   The default is to use the SSE1 version. If you define USE_SSE2,
   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/
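/* For example, a typical build might look like this (an illustrative GCC
   invocation, not mandated by this file; adjust flags for your compiler,
   and note that foo.c is just a placeholder translation unit that includes
   this header):

     gcc -O2 -mmmx -msse -c foo.c          SSE1+MMX code paths
     gcc -O2 -msse2 -DUSE_SSE2 -c foo.c    SSE2 code paths
*/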

/* Copyright (C) 2007 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 floats (SSE1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 ints (SSE2)
#else
typedef __m64 v2si;   // vector of 2 ints (MMX)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

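/* For reference, _PS_CONST(1, 1.0f) expands (on gcc/icc) to:

     static const float _ps_1[4] __attribute__((aligned(16)))
        = { 1.0f, 1.0f, 1.0f, 1.0f };

   i.e. a 16-byte aligned array of four identical floats that can be loaded
   straight into an XMM register with *(v4sf*)_ps_1. */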
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

v4sf log_ps(v4sf x);
v4sf exp_ps(v4sf x);
v4sf sin_ps(v4sf x);
v4sf cos_ps(v4sf x);
void sincos_ps(v4sf x, v4sf *s, v4sf *c);

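/* Example usage (an illustrative sketch, not part of the original library;
   demo_sincos is a hypothetical helper that merely shows the calling
   convention):

     #include <math.h>
     #include <stdio.h>

     static void demo_sincos(void)
     {
        float in[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
        float s[4], c[4];
        v4sf vs, vc;
        v4sf vx = _mm_loadu_ps(in);   // unaligned load of 4 floats
        sincos_ps(vx, &vs, &vc);      // sine and cosine of all 4 lanes
        _mm_storeu_ps(s, vs);
        _mm_storeu_ps(c, vc);
        for (int i = 0; i < 4; ++i)
           printf("sin(%g)=%g  cos(%g)=%g\n", in[i], s[i], in[i], c[i]);
     }
*/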
#ifndef USE_SSE2
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2

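/* Note: on x86 the low 64 bits of the __m128 (lanes 0 and 1) land in mm[0]
   and the high 64 bits (lanes 2 and 3) land in mm[1], so for example

     COPY_XMM_TO_MM(x, mm0, mm1);   // mm0 = lanes 0-1 of x, mm1 = lanes 2-3

   and COPY_MM_TO_XMM performs the inverse reassembly. */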
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
  v4sf e, mask, tmp, z, y;

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */

  mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);


  z = _mm_mul_ps(x,x);

  y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);


  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);


  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask);  // negative arg will be NaN
  return x;
}
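/* Quick illustrative check: log_ps(_mm_set1_ps(expf(1.0f))) yields four lanes
   close to 1.0f, while log_ps(_mm_set1_ps(-1.0f)) yields NaN in every lane,
   because invalid_mask is all-ones for non-positive inputs and OR-ing it into
   the result produces a NaN bit pattern. */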

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;
  v4sf mask, z, y, pow2n;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
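/* Note: because x is clamped to [-88.376..., +88.376...] up front, exp_ps
   saturates for out-of-range inputs instead of overflowing or underflowing;
   e.g. exp_ps(_mm_set1_ps(1000.f)) returns large finite values on the order
   of 1e38 rather than +inf, and very negative inputs return tiny positive
   values rather than 0. */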

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, the
   extra precision is simply missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   Mac OS vsinf SSE2 function, and 1.5 times faster than __vrs4_sinf
   from AMD's ACML (which is only available in 64-bit). Not too bad
   for an SSE1 function with no special tuning!
   However, the latter libraries probably have much better handling of
   NaN, Inf, denormals and other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately
   95 cycles.

   From what I have observed in experiments with the Intel AMath lib,
   switching to an SSE2 version would improve performance by only
   about 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  v4sf swap_sign_bit, poly_mask, z, tmp, y2;

  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  //printf("plop:"); print4(y);
#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  swap_sign_bit = _mm_castsi128_ps(emm0);
  poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 < x <= Pi/2) */

  y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
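/* A minimal accuracy check (an illustrative sketch; check_sin_ps is a
   hypothetical helper, not part of this file):

     #include <math.h>
     #include <stdio.h>

     static void check_sin_ps(void)
     {
        float in[4] = { 0.1f, 1.0f, 3.0f, 100.0f };
        float out[4];
        _mm_storeu_ps(out, sin_ps(_mm_loadu_ps(in)));
        for (int i = 0; i < 4; ++i)
           printf("x=%g  sin_ps=%g  sinf=%g  diff=%g\n",
                  in[i], out[i], sinf(in[i]), out[i] - sinf(in[i]));
     }

   Per the notes above, the differences stay tiny as long as |x| < 8192. */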

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  v4sf sign_bit, poly_mask, z, tmp, y2;

  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  sign_bit = _mm_castsi128_ps(emm0);
  poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);


  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 < x <= Pi/2) */

  y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}

/* Since sin_ps and cos_ps are almost identical, sincos_ps could replace both
   of them. It is almost as fast, and gives you a free cosine with your sine. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  v4sf swap_sign_bit_sin, poly_mask, z, tmp, y2, ysin1, ysin2;
  v4sf sign_bit_cos;

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 < x <= Pi/2) */

  y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  ysin2 = _mm_and_ps(xmm3, y2);
  ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
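/* Example of processing a whole array four floats at a time (an illustrative
   sketch; array_sincos is a hypothetical helper, and n is assumed to be a
   multiple of 4 -- a scalar tail loop would be needed otherwise):

     static void array_sincos(const float *x, float *s, float *c, int n)
     {
        for (int i = 0; i < n; i += 4) {
           v4sf vs, vc;
           sincos_ps(_mm_loadu_ps(x + i), &vs, &vc); // unaligned load of 4 inputs
           _mm_storeu_ps(s + i, vs);                 // 4 sines
           _mm_storeu_ps(c + i, vc);                 // 4 cosines
        }
     }
*/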