/* Copyright (C) 2003, 2004 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */
/* As a special exception, if you include this header file into source
   files compiled by GCC, this header file does not by itself cause
   the resulting executable to be covered by the GNU General Public
   License.  This exception does not however invalidate any other
   reasons why the executable file might be covered by the GNU General
   Public License.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 8.0.  */
30 #ifndef _EMMINTRIN_H_INCLUDED
31 #define _EMMINTRIN_H_INCLUDED
34 #include <xmmintrin.h>
/* Internal SSE2 vector types.  Each is a 16-byte GCC vector of the
   element type named in the typedef.  */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
/* Create a selector for use with the SHUFPD instruction.  FP1 lands in
   bit 1 of the result and FP0 in bit 0.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
/* Public SSE2 vector type names, mapped onto the internal vector types.  */
#define __m128i __v2di
#define __m128d __v2df
50 /* Create a vector with element 0 as *P and the rest zero. */
51 static __inline __m128d
52 _mm_load_sd (double const *__P
)
54 return (__m128d
) __builtin_ia32_loadsd (__P
);
57 /* Create a vector with all two elements equal to *P. */
58 static __inline __m128d
59 _mm_load1_pd (double const *__P
)
61 __v2df __tmp
= __builtin_ia32_loadsd (__P
);
62 return (__m128d
) __builtin_ia32_shufpd (__tmp
, __tmp
, _MM_SHUFFLE2 (0,0));
65 static __inline __m128d
66 _mm_load_pd1 (double const *__P
)
68 return _mm_load1_pd (__P
);
71 /* Load two DPFP values from P. The address must be 16-byte aligned. */
72 static __inline __m128d
73 _mm_load_pd (double const *__P
)
75 return (__m128d
) __builtin_ia32_loadapd (__P
);
78 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
79 static __inline __m128d
80 _mm_loadu_pd (double const *__P
)
82 return (__m128d
) __builtin_ia32_loadupd (__P
);
85 /* Load two DPFP values in reverse order. The address must be aligned. */
86 static __inline __m128d
87 _mm_loadr_pd (double const *__P
)
89 __v2df __tmp
= __builtin_ia32_loadapd (__P
);
90 return (__m128d
) __builtin_ia32_shufpd (__tmp
, __tmp
, _MM_SHUFFLE2 (0,1));
93 /* Create a vector with element 0 as F and the rest zero. */
94 static __inline __m128d
95 _mm_set_sd (double __F
)
97 return (__m128d
) __builtin_ia32_loadsd (&__F
);
100 /* Create a vector with all two elements equal to F. */
101 static __inline __m128d
102 _mm_set1_pd (double __F
)
104 __v2df __tmp
= __builtin_ia32_loadsd (&__F
);
105 return (__m128d
) __builtin_ia32_shufpd (__tmp
, __tmp
, _MM_SHUFFLE2 (0,0));
108 static __inline __m128d
109 _mm_set_pd1 (double __F
)
111 return _mm_set1_pd (__F
);
114 /* Create the vector [Z Y]. */
115 static __inline __m128d
116 _mm_set_pd (double __Z
, double __Y
)
118 return (__v2df
) {__Y
, __Z
};
121 /* Create the vector [Y Z]. */
122 static __inline __m128d
123 _mm_setr_pd (double __Z
, double __Y
)
125 return _mm_set_pd (__Y
, __Z
);
128 /* Create a vector of zeros. */
129 static __inline __m128d
130 _mm_setzero_pd (void)
132 return (__m128d
) __builtin_ia32_setzeropd ();
135 /* Stores the lower DPFP value. */
137 _mm_store_sd (double *__P
, __m128d __A
)
139 __builtin_ia32_storesd (__P
, (__v2df
)__A
);
142 /* Store the lower DPFP value across two words. */
144 _mm_store1_pd (double *__P
, __m128d __A
)
146 __v2df __va
= (__v2df
)__A
;
147 __v2df __tmp
= __builtin_ia32_shufpd (__va
, __va
, _MM_SHUFFLE2 (0,0));
148 __builtin_ia32_storeapd (__P
, __tmp
);
152 _mm_store_pd1 (double *__P
, __m128d __A
)
154 _mm_store1_pd (__P
, __A
);
157 /* Store two DPFP values. The address must be 16-byte aligned. */
159 _mm_store_pd (double *__P
, __m128d __A
)
161 __builtin_ia32_storeapd (__P
, (__v2df
)__A
);
164 /* Store two DPFP values. The address need not be 16-byte aligned. */
166 _mm_storeu_pd (double *__P
, __m128d __A
)
168 __builtin_ia32_storeupd (__P
, (__v2df
)__A
);
171 /* Store two DPFP values in reverse order. The address must be aligned. */
173 _mm_storer_pd (double *__P
, __m128d __A
)
175 __v2df __va
= (__v2df
)__A
;
176 __v2df __tmp
= __builtin_ia32_shufpd (__va
, __va
, _MM_SHUFFLE2 (0,1));
177 __builtin_ia32_storeapd (__P
, __tmp
);
181 _mm_cvtsi128_si32 (__m128i __A
)
184 __builtin_ia32_stored (&__tmp
, (__v4si
)__A
);
189 static __inline
long long
190 _mm_cvtsi128_si64x (__m128i __A
)
192 return __builtin_ia32_movdq2q ((__v2di
)__A
);
196 /* Sets the low DPFP value of A from the low value of B. */
197 static __inline __m128d
198 _mm_move_sd (__m128d __A
, __m128d __B
)
200 return (__m128d
) __builtin_ia32_movsd ((__v2df
)__A
, (__v2df
)__B
);
204 static __inline __m128d
205 _mm_add_pd (__m128d __A
, __m128d __B
)
207 return (__m128d
)__builtin_ia32_addpd ((__v2df
)__A
, (__v2df
)__B
);
210 static __inline __m128d
211 _mm_add_sd (__m128d __A
, __m128d __B
)
213 return (__m128d
)__builtin_ia32_addsd ((__v2df
)__A
, (__v2df
)__B
);
216 static __inline __m128d
217 _mm_sub_pd (__m128d __A
, __m128d __B
)
219 return (__m128d
)__builtin_ia32_subpd ((__v2df
)__A
, (__v2df
)__B
);
222 static __inline __m128d
223 _mm_sub_sd (__m128d __A
, __m128d __B
)
225 return (__m128d
)__builtin_ia32_subsd ((__v2df
)__A
, (__v2df
)__B
);
228 static __inline __m128d
229 _mm_mul_pd (__m128d __A
, __m128d __B
)
231 return (__m128d
)__builtin_ia32_mulpd ((__v2df
)__A
, (__v2df
)__B
);
234 static __inline __m128d
235 _mm_mul_sd (__m128d __A
, __m128d __B
)
237 return (__m128d
)__builtin_ia32_mulsd ((__v2df
)__A
, (__v2df
)__B
);
240 static __inline __m128d
241 _mm_div_pd (__m128d __A
, __m128d __B
)
243 return (__m128d
)__builtin_ia32_divpd ((__v2df
)__A
, (__v2df
)__B
);
246 static __inline __m128d
247 _mm_div_sd (__m128d __A
, __m128d __B
)
249 return (__m128d
)__builtin_ia32_divsd ((__v2df
)__A
, (__v2df
)__B
);
252 static __inline __m128d
253 _mm_sqrt_pd (__m128d __A
)
255 return (__m128d
)__builtin_ia32_sqrtpd ((__v2df
)__A
);
258 /* Return pair {sqrt (A[0), B[1]}. */
259 static __inline __m128d
260 _mm_sqrt_sd (__m128d __A
, __m128d __B
)
262 __v2df __tmp
= __builtin_ia32_movsd ((__v2df
)__A
, (__v2df
)__B
);
263 return (__m128d
)__builtin_ia32_sqrtsd ((__v2df
)__tmp
);
266 static __inline __m128d
267 _mm_min_pd (__m128d __A
, __m128d __B
)
269 return (__m128d
)__builtin_ia32_minpd ((__v2df
)__A
, (__v2df
)__B
);
272 static __inline __m128d
273 _mm_min_sd (__m128d __A
, __m128d __B
)
275 return (__m128d
)__builtin_ia32_minsd ((__v2df
)__A
, (__v2df
)__B
);
278 static __inline __m128d
279 _mm_max_pd (__m128d __A
, __m128d __B
)
281 return (__m128d
)__builtin_ia32_maxpd ((__v2df
)__A
, (__v2df
)__B
);
284 static __inline __m128d
285 _mm_max_sd (__m128d __A
, __m128d __B
)
287 return (__m128d
)__builtin_ia32_maxsd ((__v2df
)__A
, (__v2df
)__B
);
290 static __inline __m128d
291 _mm_and_pd (__m128d __A
, __m128d __B
)
293 return (__m128d
)__builtin_ia32_andpd ((__v2df
)__A
, (__v2df
)__B
);
296 static __inline __m128d
297 _mm_andnot_pd (__m128d __A
, __m128d __B
)
299 return (__m128d
)__builtin_ia32_andnpd ((__v2df
)__A
, (__v2df
)__B
);
302 static __inline __m128d
303 _mm_or_pd (__m128d __A
, __m128d __B
)
305 return (__m128d
)__builtin_ia32_orpd ((__v2df
)__A
, (__v2df
)__B
);
308 static __inline __m128d
309 _mm_xor_pd (__m128d __A
, __m128d __B
)
311 return (__m128d
)__builtin_ia32_xorpd ((__v2df
)__A
, (__v2df
)__B
);
314 static __inline __m128d
315 _mm_cmpeq_pd (__m128d __A
, __m128d __B
)
317 return (__m128d
)__builtin_ia32_cmpeqpd ((__v2df
)__A
, (__v2df
)__B
);
320 static __inline __m128d
321 _mm_cmplt_pd (__m128d __A
, __m128d __B
)
323 return (__m128d
)__builtin_ia32_cmpltpd ((__v2df
)__A
, (__v2df
)__B
);
326 static __inline __m128d
327 _mm_cmple_pd (__m128d __A
, __m128d __B
)
329 return (__m128d
)__builtin_ia32_cmplepd ((__v2df
)__A
, (__v2df
)__B
);
332 static __inline __m128d
333 _mm_cmpgt_pd (__m128d __A
, __m128d __B
)
335 return (__m128d
)__builtin_ia32_cmpgtpd ((__v2df
)__A
, (__v2df
)__B
);
338 static __inline __m128d
339 _mm_cmpge_pd (__m128d __A
, __m128d __B
)
341 return (__m128d
)__builtin_ia32_cmpgepd ((__v2df
)__A
, (__v2df
)__B
);
344 static __inline __m128d
345 _mm_cmpneq_pd (__m128d __A
, __m128d __B
)
347 return (__m128d
)__builtin_ia32_cmpneqpd ((__v2df
)__A
, (__v2df
)__B
);
350 static __inline __m128d
351 _mm_cmpnlt_pd (__m128d __A
, __m128d __B
)
353 return (__m128d
)__builtin_ia32_cmpnltpd ((__v2df
)__A
, (__v2df
)__B
);
356 static __inline __m128d
357 _mm_cmpnle_pd (__m128d __A
, __m128d __B
)
359 return (__m128d
)__builtin_ia32_cmpnlepd ((__v2df
)__A
, (__v2df
)__B
);
362 static __inline __m128d
363 _mm_cmpngt_pd (__m128d __A
, __m128d __B
)
365 return (__m128d
)__builtin_ia32_cmpngtpd ((__v2df
)__A
, (__v2df
)__B
);
368 static __inline __m128d
369 _mm_cmpnge_pd (__m128d __A
, __m128d __B
)
371 return (__m128d
)__builtin_ia32_cmpngepd ((__v2df
)__A
, (__v2df
)__B
);
374 static __inline __m128d
375 _mm_cmpord_pd (__m128d __A
, __m128d __B
)
377 return (__m128d
)__builtin_ia32_cmpordpd ((__v2df
)__A
, (__v2df
)__B
);
380 static __inline __m128d
381 _mm_cmpunord_pd (__m128d __A
, __m128d __B
)
383 return (__m128d
)__builtin_ia32_cmpunordpd ((__v2df
)__A
, (__v2df
)__B
);
386 static __inline __m128d
387 _mm_cmpeq_sd (__m128d __A
, __m128d __B
)
389 return (__m128d
)__builtin_ia32_cmpeqsd ((__v2df
)__A
, (__v2df
)__B
);
392 static __inline __m128d
393 _mm_cmplt_sd (__m128d __A
, __m128d __B
)
395 return (__m128d
)__builtin_ia32_cmpltsd ((__v2df
)__A
, (__v2df
)__B
);
398 static __inline __m128d
399 _mm_cmple_sd (__m128d __A
, __m128d __B
)
401 return (__m128d
)__builtin_ia32_cmplesd ((__v2df
)__A
, (__v2df
)__B
);
404 static __inline __m128d
405 _mm_cmpgt_sd (__m128d __A
, __m128d __B
)
407 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
409 __builtin_ia32_cmpltsd ((__v2df
) __B
,
414 static __inline __m128d
415 _mm_cmpge_sd (__m128d __A
, __m128d __B
)
417 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
419 __builtin_ia32_cmplesd ((__v2df
) __B
,
424 static __inline __m128d
425 _mm_cmpneq_sd (__m128d __A
, __m128d __B
)
427 return (__m128d
)__builtin_ia32_cmpneqsd ((__v2df
)__A
, (__v2df
)__B
);
430 static __inline __m128d
431 _mm_cmpnlt_sd (__m128d __A
, __m128d __B
)
433 return (__m128d
)__builtin_ia32_cmpnltsd ((__v2df
)__A
, (__v2df
)__B
);
436 static __inline __m128d
437 _mm_cmpnle_sd (__m128d __A
, __m128d __B
)
439 return (__m128d
)__builtin_ia32_cmpnlesd ((__v2df
)__A
, (__v2df
)__B
);
442 static __inline __m128d
443 _mm_cmpngt_sd (__m128d __A
, __m128d __B
)
445 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
447 __builtin_ia32_cmpnltsd ((__v2df
) __B
,
452 static __inline __m128d
453 _mm_cmpnge_sd (__m128d __A
, __m128d __B
)
455 return (__m128d
) __builtin_ia32_movsd ((__v2df
) __A
,
457 __builtin_ia32_cmpnlesd ((__v2df
) __B
,
462 static __inline __m128d
463 _mm_cmpord_sd (__m128d __A
, __m128d __B
)
465 return (__m128d
)__builtin_ia32_cmpordsd ((__v2df
)__A
, (__v2df
)__B
);
468 static __inline __m128d
469 _mm_cmpunord_sd (__m128d __A
, __m128d __B
)
471 return (__m128d
)__builtin_ia32_cmpunordsd ((__v2df
)__A
, (__v2df
)__B
);
475 _mm_comieq_sd (__m128d __A
, __m128d __B
)
477 return __builtin_ia32_comisdeq ((__v2df
)__A
, (__v2df
)__B
);
481 _mm_comilt_sd (__m128d __A
, __m128d __B
)
483 return __builtin_ia32_comisdlt ((__v2df
)__A
, (__v2df
)__B
);
487 _mm_comile_sd (__m128d __A
, __m128d __B
)
489 return __builtin_ia32_comisdle ((__v2df
)__A
, (__v2df
)__B
);
493 _mm_comigt_sd (__m128d __A
, __m128d __B
)
495 return __builtin_ia32_comisdgt ((__v2df
)__A
, (__v2df
)__B
);
499 _mm_comige_sd (__m128d __A
, __m128d __B
)
501 return __builtin_ia32_comisdge ((__v2df
)__A
, (__v2df
)__B
);
505 _mm_comineq_sd (__m128d __A
, __m128d __B
)
507 return __builtin_ia32_comisdneq ((__v2df
)__A
, (__v2df
)__B
);
511 _mm_ucomieq_sd (__m128d __A
, __m128d __B
)
513 return __builtin_ia32_ucomisdeq ((__v2df
)__A
, (__v2df
)__B
);
517 _mm_ucomilt_sd (__m128d __A
, __m128d __B
)
519 return __builtin_ia32_ucomisdlt ((__v2df
)__A
, (__v2df
)__B
);
523 _mm_ucomile_sd (__m128d __A
, __m128d __B
)
525 return __builtin_ia32_ucomisdle ((__v2df
)__A
, (__v2df
)__B
);
529 _mm_ucomigt_sd (__m128d __A
, __m128d __B
)
531 return __builtin_ia32_ucomisdgt ((__v2df
)__A
, (__v2df
)__B
);
535 _mm_ucomige_sd (__m128d __A
, __m128d __B
)
537 return __builtin_ia32_ucomisdge ((__v2df
)__A
, (__v2df
)__B
);
541 _mm_ucomineq_sd (__m128d __A
, __m128d __B
)
543 return __builtin_ia32_ucomisdneq ((__v2df
)__A
, (__v2df
)__B
);
546 /* Create a vector with element 0 as *P and the rest zero. */
548 static __inline __m128i
549 _mm_load_si128 (__m128i
const *__P
)
551 return (__m128i
) __builtin_ia32_loaddqa ((char const *)__P
);
554 static __inline __m128i
555 _mm_loadu_si128 (__m128i
const *__P
)
557 return (__m128i
) __builtin_ia32_loaddqu ((char const *)__P
);
560 static __inline __m128i
561 _mm_loadl_epi64 (__m128i
const *__P
)
563 return (__m128i
) __builtin_ia32_movq2dq (*(unsigned long long *)__P
);
567 _mm_store_si128 (__m128i
*__P
, __m128i __B
)
569 __builtin_ia32_storedqa ((char *)__P
, (__v16qi
)__B
);
573 _mm_storeu_si128 (__m128i
*__P
, __m128i __B
)
575 __builtin_ia32_storedqu ((char *)__P
, (__v16qi
)__B
);
579 _mm_storel_epi64 (__m128i
*__P
, __m128i __B
)
581 *(long long *)__P
= __builtin_ia32_movdq2q ((__v2di
)__B
);
584 static __inline __m64
585 _mm_movepi64_pi64 (__m128i __B
)
587 return (__m64
) __builtin_ia32_movdq2q ((__v2di
)__B
);
590 static __inline __m128i
591 _mm_move_epi64 (__m128i __A
)
593 return (__m128i
) __builtin_ia32_movq ((__v2di
)__A
);
596 /* Create a vector of zeros. */
597 static __inline __m128i
598 _mm_setzero_si128 (void)
600 return (__m128i
) __builtin_ia32_setzero128 ();
603 static __inline __m128i
604 _mm_set_epi64 (__m64 __A
, __m64 __B
)
606 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
607 __v2di __tmp2
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__B
);
608 return (__m128i
)__builtin_ia32_punpcklqdq128 (__tmp2
, __tmp
);
611 /* Create the vector [Z Y X W]. */
612 static __inline __m128i
613 _mm_set_epi32 (int __Z
, int __Y
, int __X
, int __W
)
629 /* Create the vector [Z Y]. */
630 static __inline __m128i
631 _mm_set_epi64x (long long __Z
, long long __Y
)
645 /* Create the vector [S T U V Z Y X W]. */
646 static __inline __m128i
647 _mm_set_epi16 (short __Z
, short __Y
, short __X
, short __W
,
648 short __V
, short __U
, short __T
, short __S
)
667 /* Create the vector [S T U V Z Y X W]. */
668 static __inline __m128i
669 _mm_set_epi8 (char __Z
, char __Y
, char __X
, char __W
,
670 char __V
, char __U
, char __T
, char __S
,
671 char __Z1
, char __Y1
, char __X1
, char __W1
,
672 char __V1
, char __U1
, char __T1
, char __S1
)
699 static __inline __m128i
700 _mm_set1_epi64 (__m64 __A
)
702 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
703 return (__m128i
)__builtin_ia32_punpcklqdq128 (__tmp
, __tmp
);
706 static __inline __m128i
707 _mm_set1_epi32 (int __A
)
709 __v4si __tmp
= (__v4si
)__builtin_ia32_loadd (&__A
);
710 return (__m128i
) __builtin_ia32_pshufd ((__v4si
)__tmp
, _MM_SHUFFLE (0,0,0,0));
714 static __inline __m128i
715 _mm_set1_epi64x (long long __A
)
717 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
718 return (__m128i
) __builtin_ia32_shufpd ((__v2df
)__tmp
, (__v2df
)__tmp
, _MM_SHUFFLE2 (0,0));
722 static __inline __m128i
723 _mm_set1_epi16 (short __A
)
725 int __Acopy
= (unsigned short)__A
;
726 __v4si __tmp
= (__v4si
)__builtin_ia32_loadd (&__Acopy
);
727 __tmp
= (__v4si
)__builtin_ia32_punpcklwd128 ((__v8hi
)__tmp
, (__v8hi
)__tmp
);
728 return (__m128i
) __builtin_ia32_pshufd ((__v4si
)__tmp
, _MM_SHUFFLE (0,0,0,0));
731 static __inline __m128i
732 _mm_set1_epi8 (char __A
)
734 int __Acopy
= (unsigned char)__A
;
735 __v4si __tmp
= (__v4si
)__builtin_ia32_loadd (&__Acopy
);
736 __tmp
= (__v4si
)__builtin_ia32_punpcklbw128 ((__v16qi
)__tmp
, (__v16qi
)__tmp
);
737 __tmp
= (__v4si
)__builtin_ia32_punpcklbw128 ((__v16qi
)__tmp
, (__v16qi
)__tmp
);
738 return (__m128i
) __builtin_ia32_pshufd ((__v4si
)__tmp
, _MM_SHUFFLE (0,0,0,0));
741 static __inline __m128i
742 _mm_setr_epi64 (__m64 __A
, __m64 __B
)
744 __v2di __tmp
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
745 __v2di __tmp2
= (__v2di
)__builtin_ia32_movq2dq ((unsigned long long)__B
);
746 return (__m128i
)__builtin_ia32_punpcklqdq128 (__tmp
, __tmp2
);
749 /* Create the vector [Z Y X W]. */
750 static __inline __m128i
751 _mm_setr_epi32 (int __W
, int __X
, int __Y
, int __Z
)
765 /* Create the vector [S T U V Z Y X W]. */
766 static __inline __m128i
767 _mm_setr_epi16 (short __S
, short __T
, short __U
, short __V
,
768 short __W
, short __X
, short __Y
, short __Z
)
787 /* Create the vector [S T U V Z Y X W]. */
788 static __inline __m128i
789 _mm_setr_epi8 (char __S1
, char __T1
, char __U1
, char __V1
,
790 char __W1
, char __X1
, char __Y1
, char __Z1
,
791 char __S
, char __T
, char __U
, char __V
,
792 char __W
, char __X
, char __Y
, char __Z
)
819 static __inline __m128d
820 _mm_cvtepi32_pd (__m128i __A
)
822 return (__m128d
)__builtin_ia32_cvtdq2pd ((__v4si
) __A
);
825 static __inline __m128
826 _mm_cvtepi32_ps (__m128i __A
)
828 return (__m128
)__builtin_ia32_cvtdq2ps ((__v4si
) __A
);
831 static __inline __m128i
832 _mm_cvtpd_epi32 (__m128d __A
)
834 return (__m128i
)__builtin_ia32_cvtpd2dq ((__v2df
) __A
);
837 static __inline __m64
838 _mm_cvtpd_pi32 (__m128d __A
)
840 return (__m64
)__builtin_ia32_cvtpd2pi ((__v2df
) __A
);
843 static __inline __m128
844 _mm_cvtpd_ps (__m128d __A
)
846 return (__m128
)__builtin_ia32_cvtpd2ps ((__v2df
) __A
);
849 static __inline __m128i
850 _mm_cvttpd_epi32 (__m128d __A
)
852 return (__m128i
)__builtin_ia32_cvttpd2dq ((__v2df
) __A
);
855 static __inline __m64
856 _mm_cvttpd_pi32 (__m128d __A
)
858 return (__m64
)__builtin_ia32_cvttpd2pi ((__v2df
) __A
);
861 static __inline __m128d
862 _mm_cvtpi32_pd (__m64 __A
)
864 return (__m128d
)__builtin_ia32_cvtpi2pd ((__v2si
) __A
);
867 static __inline __m128i
868 _mm_cvtps_epi32 (__m128 __A
)
870 return (__m128i
)__builtin_ia32_cvtps2dq ((__v4sf
) __A
);
873 static __inline __m128i
874 _mm_cvttps_epi32 (__m128 __A
)
876 return (__m128i
)__builtin_ia32_cvttps2dq ((__v4sf
) __A
);
879 static __inline __m128d
880 _mm_cvtps_pd (__m128 __A
)
882 return (__m128d
)__builtin_ia32_cvtps2pd ((__v4sf
) __A
);
886 _mm_cvtsd_si32 (__m128d __A
)
888 return __builtin_ia32_cvtsd2si ((__v2df
) __A
);
892 static __inline
long long
893 _mm_cvtsd_si64x (__m128d __A
)
895 return __builtin_ia32_cvtsd2si64 ((__v2df
) __A
);
900 _mm_cvttsd_si32 (__m128d __A
)
902 return __builtin_ia32_cvttsd2si ((__v2df
) __A
);
906 static __inline
long long
907 _mm_cvttsd_si64x (__m128d __A
)
909 return __builtin_ia32_cvttsd2si64 ((__v2df
) __A
);
913 static __inline __m128
914 _mm_cvtsd_ss (__m128 __A
, __m128d __B
)
916 return (__m128
)__builtin_ia32_cvtsd2ss ((__v4sf
) __A
, (__v2df
) __B
);
919 static __inline __m128d
920 _mm_cvtsi32_sd (__m128d __A
, int __B
)
922 return (__m128d
)__builtin_ia32_cvtsi2sd ((__v2df
) __A
, __B
);
926 static __inline __m128d
927 _mm_cvtsi64x_sd (__m128d __A
, long long __B
)
929 return (__m128d
)__builtin_ia32_cvtsi642sd ((__v2df
) __A
, __B
);
933 static __inline __m128d
934 _mm_cvtss_sd (__m128d __A
, __m128 __B
)
936 return (__m128d
)__builtin_ia32_cvtss2sd ((__v2df
) __A
, (__v4sf
)__B
);
/* Shuffle A and B with selector C through the shufpd builtin.  The
   arguments are parenthesized so that the casts apply to the whole
   argument expression.  */
#define _mm_shuffle_pd(__A, __B, __C) \
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__A), (__v2df)(__B), (__C)))
941 static __inline __m128d
942 _mm_unpackhi_pd (__m128d __A
, __m128d __B
)
944 return (__m128d
)__builtin_ia32_unpckhpd ((__v2df
)__A
, (__v2df
)__B
);
947 static __inline __m128d
948 _mm_unpacklo_pd (__m128d __A
, __m128d __B
)
950 return (__m128d
)__builtin_ia32_unpcklpd ((__v2df
)__A
, (__v2df
)__B
);
953 static __inline __m128d
954 _mm_loadh_pd (__m128d __A
, double const *__B
)
956 return (__m128d
)__builtin_ia32_loadhpd ((__v2df
)__A
, __B
);
960 _mm_storeh_pd (double *__A
, __m128d __B
)
962 __builtin_ia32_storehpd (__A
, (__v2df
)__B
);
965 static __inline __m128d
966 _mm_loadl_pd (__m128d __A
, double const *__B
)
968 return (__m128d
)__builtin_ia32_loadlpd ((__v2df
)__A
, __B
);
972 _mm_storel_pd (double *__A
, __m128d __B
)
974 __builtin_ia32_storelpd (__A
, (__v2df
)__B
);
978 _mm_movemask_pd (__m128d __A
)
980 return __builtin_ia32_movmskpd ((__v2df
)__A
);
983 static __inline __m128i
984 _mm_packs_epi16 (__m128i __A
, __m128i __B
)
986 return (__m128i
)__builtin_ia32_packsswb128 ((__v8hi
)__A
, (__v8hi
)__B
);
989 static __inline __m128i
990 _mm_packs_epi32 (__m128i __A
, __m128i __B
)
992 return (__m128i
)__builtin_ia32_packssdw128 ((__v4si
)__A
, (__v4si
)__B
);
995 static __inline __m128i
996 _mm_packus_epi16 (__m128i __A
, __m128i __B
)
998 return (__m128i
)__builtin_ia32_packuswb128 ((__v8hi
)__A
, (__v8hi
)__B
);
1001 static __inline __m128i
1002 _mm_unpackhi_epi8 (__m128i __A
, __m128i __B
)
1004 return (__m128i
)__builtin_ia32_punpckhbw128 ((__v16qi
)__A
, (__v16qi
)__B
);
1007 static __inline __m128i
1008 _mm_unpackhi_epi16 (__m128i __A
, __m128i __B
)
1010 return (__m128i
)__builtin_ia32_punpckhwd128 ((__v8hi
)__A
, (__v8hi
)__B
);
1013 static __inline __m128i
1014 _mm_unpackhi_epi32 (__m128i __A
, __m128i __B
)
1016 return (__m128i
)__builtin_ia32_punpckhdq128 ((__v4si
)__A
, (__v4si
)__B
);
1019 static __inline __m128i
1020 _mm_unpackhi_epi64 (__m128i __A
, __m128i __B
)
1022 return (__m128i
)__builtin_ia32_punpckhqdq128 ((__v2di
)__A
, (__v2di
)__B
);
1025 static __inline __m128i
1026 _mm_unpacklo_epi8 (__m128i __A
, __m128i __B
)
1028 return (__m128i
)__builtin_ia32_punpcklbw128 ((__v16qi
)__A
, (__v16qi
)__B
);
1031 static __inline __m128i
1032 _mm_unpacklo_epi16 (__m128i __A
, __m128i __B
)
1034 return (__m128i
)__builtin_ia32_punpcklwd128 ((__v8hi
)__A
, (__v8hi
)__B
);
1037 static __inline __m128i
1038 _mm_unpacklo_epi32 (__m128i __A
, __m128i __B
)
1040 return (__m128i
)__builtin_ia32_punpckldq128 ((__v4si
)__A
, (__v4si
)__B
);
1043 static __inline __m128i
1044 _mm_unpacklo_epi64 (__m128i __A
, __m128i __B
)
1046 return (__m128i
)__builtin_ia32_punpcklqdq128 ((__v2di
)__A
, (__v2di
)__B
);
1049 static __inline __m128i
1050 _mm_add_epi8 (__m128i __A
, __m128i __B
)
1052 return (__m128i
)__builtin_ia32_paddb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1055 static __inline __m128i
1056 _mm_add_epi16 (__m128i __A
, __m128i __B
)
1058 return (__m128i
)__builtin_ia32_paddw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1061 static __inline __m128i
1062 _mm_add_epi32 (__m128i __A
, __m128i __B
)
1064 return (__m128i
)__builtin_ia32_paddd128 ((__v4si
)__A
, (__v4si
)__B
);
1067 static __inline __m128i
1068 _mm_add_epi64 (__m128i __A
, __m128i __B
)
1070 return (__m128i
)__builtin_ia32_paddq128 ((__v2di
)__A
, (__v2di
)__B
);
1073 static __inline __m128i
1074 _mm_adds_epi8 (__m128i __A
, __m128i __B
)
1076 return (__m128i
)__builtin_ia32_paddsb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1079 static __inline __m128i
1080 _mm_adds_epi16 (__m128i __A
, __m128i __B
)
1082 return (__m128i
)__builtin_ia32_paddsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1085 static __inline __m128i
1086 _mm_adds_epu8 (__m128i __A
, __m128i __B
)
1088 return (__m128i
)__builtin_ia32_paddusb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1091 static __inline __m128i
1092 _mm_adds_epu16 (__m128i __A
, __m128i __B
)
1094 return (__m128i
)__builtin_ia32_paddusw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1097 static __inline __m128i
1098 _mm_sub_epi8 (__m128i __A
, __m128i __B
)
1100 return (__m128i
)__builtin_ia32_psubb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1103 static __inline __m128i
1104 _mm_sub_epi16 (__m128i __A
, __m128i __B
)
1106 return (__m128i
)__builtin_ia32_psubw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1109 static __inline __m128i
1110 _mm_sub_epi32 (__m128i __A
, __m128i __B
)
1112 return (__m128i
)__builtin_ia32_psubd128 ((__v4si
)__A
, (__v4si
)__B
);
1115 static __inline __m128i
1116 _mm_sub_epi64 (__m128i __A
, __m128i __B
)
1118 return (__m128i
)__builtin_ia32_psubq128 ((__v2di
)__A
, (__v2di
)__B
);
1121 static __inline __m128i
1122 _mm_subs_epi8 (__m128i __A
, __m128i __B
)
1124 return (__m128i
)__builtin_ia32_psubsb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1127 static __inline __m128i
1128 _mm_subs_epi16 (__m128i __A
, __m128i __B
)
1130 return (__m128i
)__builtin_ia32_psubsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1133 static __inline __m128i
1134 _mm_subs_epu8 (__m128i __A
, __m128i __B
)
1136 return (__m128i
)__builtin_ia32_psubusb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1139 static __inline __m128i
1140 _mm_subs_epu16 (__m128i __A
, __m128i __B
)
1142 return (__m128i
)__builtin_ia32_psubusw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1145 static __inline __m128i
1146 _mm_madd_epi16 (__m128i __A
, __m128i __B
)
1148 return (__m128i
)__builtin_ia32_pmaddwd128 ((__v8hi
)__A
, (__v8hi
)__B
);
1151 static __inline __m128i
1152 _mm_mulhi_epi16 (__m128i __A
, __m128i __B
)
1154 return (__m128i
)__builtin_ia32_pmulhw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1157 static __inline __m128i
1158 _mm_mullo_epi16 (__m128i __A
, __m128i __B
)
1160 return (__m128i
)__builtin_ia32_pmullw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1163 static __inline __m64
1164 _mm_mul_su32 (__m64 __A
, __m64 __B
)
1166 return (__m64
)__builtin_ia32_pmuludq ((__v2si
)__A
, (__v2si
)__B
);
1169 static __inline __m128i
1170 _mm_mul_epu32 (__m128i __A
, __m128i __B
)
1172 return (__m128i
)__builtin_ia32_pmuludq128 ((__v4si
)__A
, (__v4si
)__B
);
1175 static __inline __m128i
1176 _mm_slli_epi16 (__m128i __A
, int __B
)
1178 return (__m128i
)__builtin_ia32_psllwi128 ((__v8hi
)__A
, __B
);
1181 static __inline __m128i
1182 _mm_slli_epi32 (__m128i __A
, int __B
)
1184 return (__m128i
)__builtin_ia32_pslldi128 ((__v4si
)__A
, __B
);
1187 static __inline __m128i
1188 _mm_slli_epi64 (__m128i __A
, int __B
)
1190 return (__m128i
)__builtin_ia32_psllqi128 ((__v2di
)__A
, __B
);
1193 static __inline __m128i
1194 _mm_srai_epi16 (__m128i __A
, int __B
)
1196 return (__m128i
)__builtin_ia32_psrawi128 ((__v8hi
)__A
, __B
);
1199 static __inline __m128i
1200 _mm_srai_epi32 (__m128i __A
, int __B
)
1202 return (__m128i
)__builtin_ia32_psradi128 ((__v4si
)__A
, __B
);
1206 static __m128i
__attribute__((__always_inline__
))
1207 _mm_srli_si128 (__m128i __A
, const int __B
)
1209 return ((__m128i
)__builtin_ia32_psrldqi128 (__A
, __B
))
1212 static __m128i
__attribute__((__always_inline__
))
1213 _mm_srli_si128 (__m128i __A
, const int __B
)
1215 return ((__m128i
)__builtin_ia32_pslldqi128 (__A
, __B
))
/* Macro forms of the whole-register byte shifts.  B is multiplied by 8
   before being handed to the builtin.  */
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8))
1224 static __inline __m128i
1225 _mm_srli_epi16 (__m128i __A
, int __B
)
1227 return (__m128i
)__builtin_ia32_psrlwi128 ((__v8hi
)__A
, __B
);
1230 static __inline __m128i
1231 _mm_srli_epi32 (__m128i __A
, int __B
)
1233 return (__m128i
)__builtin_ia32_psrldi128 ((__v4si
)__A
, __B
);
1236 static __inline __m128i
1237 _mm_srli_epi64 (__m128i __A
, int __B
)
1239 return (__m128i
)__builtin_ia32_psrlqi128 ((__v2di
)__A
, __B
);
1242 static __inline __m128i
1243 _mm_sll_epi16 (__m128i __A
, __m128i __B
)
1245 return _mm_slli_epi16 (__A
, _mm_cvtsi128_si32 (__B
));
1248 static __inline __m128i
1249 _mm_sll_epi32 (__m128i __A
, __m128i __B
)
1251 return _mm_slli_epi32 (__A
, _mm_cvtsi128_si32 (__B
));
1254 static __inline __m128i
1255 _mm_sll_epi64 (__m128i __A
, __m128i __B
)
1257 return _mm_slli_epi64 (__A
, _mm_cvtsi128_si32 (__B
));
1260 static __inline __m128i
1261 _mm_sra_epi16 (__m128i __A
, __m128i __B
)
1263 return _mm_srai_epi16 (__A
, _mm_cvtsi128_si32 (__B
));
1266 static __inline __m128i
1267 _mm_sra_epi32 (__m128i __A
, __m128i __B
)
1269 return _mm_srai_epi32 (__A
, _mm_cvtsi128_si32 (__B
));
1272 static __inline __m128i
1273 _mm_srl_epi16 (__m128i __A
, __m128i __B
)
1275 return _mm_srli_epi16 (__A
, _mm_cvtsi128_si32 (__B
));
1278 static __inline __m128i
1279 _mm_srl_epi32 (__m128i __A
, __m128i __B
)
1281 return _mm_srli_epi32 (__A
, _mm_cvtsi128_si32 (__B
));
1284 static __inline __m128i
1285 _mm_srl_epi64 (__m128i __A
, __m128i __B
)
1287 return _mm_srli_epi64 (__A
, _mm_cvtsi128_si32 (__B
));
1290 static __inline __m128i
1291 _mm_and_si128 (__m128i __A
, __m128i __B
)
1293 return (__m128i
)__builtin_ia32_pand128 ((__v2di
)__A
, (__v2di
)__B
);
1296 static __inline __m128i
1297 _mm_andnot_si128 (__m128i __A
, __m128i __B
)
1299 return (__m128i
)__builtin_ia32_pandn128 ((__v2di
)__A
, (__v2di
)__B
);
1302 static __inline __m128i
1303 _mm_or_si128 (__m128i __A
, __m128i __B
)
1305 return (__m128i
)__builtin_ia32_por128 ((__v2di
)__A
, (__v2di
)__B
);
1308 static __inline __m128i
1309 _mm_xor_si128 (__m128i __A
, __m128i __B
)
1311 return (__m128i
)__builtin_ia32_pxor128 ((__v2di
)__A
, (__v2di
)__B
);
1314 static __inline __m128i
1315 _mm_cmpeq_epi8 (__m128i __A
, __m128i __B
)
1317 return (__m128i
)__builtin_ia32_pcmpeqb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1320 static __inline __m128i
1321 _mm_cmpeq_epi16 (__m128i __A
, __m128i __B
)
1323 return (__m128i
)__builtin_ia32_pcmpeqw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1326 static __inline __m128i
1327 _mm_cmpeq_epi32 (__m128i __A
, __m128i __B
)
1329 return (__m128i
)__builtin_ia32_pcmpeqd128 ((__v4si
)__A
, (__v4si
)__B
);
1332 static __inline __m128i
1333 _mm_cmplt_epi8 (__m128i __A
, __m128i __B
)
1335 return (__m128i
)__builtin_ia32_pcmpgtb128 ((__v16qi
)__B
, (__v16qi
)__A
);
1338 static __inline __m128i
1339 _mm_cmplt_epi16 (__m128i __A
, __m128i __B
)
1341 return (__m128i
)__builtin_ia32_pcmpgtw128 ((__v8hi
)__B
, (__v8hi
)__A
);
1344 static __inline __m128i
1345 _mm_cmplt_epi32 (__m128i __A
, __m128i __B
)
1347 return (__m128i
)__builtin_ia32_pcmpgtd128 ((__v4si
)__B
, (__v4si
)__A
);
1350 static __inline __m128i
1351 _mm_cmpgt_epi8 (__m128i __A
, __m128i __B
)
1353 return (__m128i
)__builtin_ia32_pcmpgtb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1356 static __inline __m128i
1357 _mm_cmpgt_epi16 (__m128i __A
, __m128i __B
)
1359 return (__m128i
)__builtin_ia32_pcmpgtw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1362 static __inline __m128i
1363 _mm_cmpgt_epi32 (__m128i __A
, __m128i __B
)
1365 return (__m128i
)__builtin_ia32_pcmpgtd128 ((__v4si
)__A
, (__v4si
)__B
);
/* 16-bit element extract/insert through the pextrw128 / pinsrw128
   builtins.  Arguments are parenthesized so the casts cover the whole
   argument expression.  */
#define _mm_extract_epi16(__A, __B) \
  __builtin_ia32_pextrw128 ((__v8hi)(__A), (__B))

#define _mm_insert_epi16(__A, __B, __C) \
  ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)(__A), (__B), (__C)))
1372 static __inline __m128i
1373 _mm_max_epi16 (__m128i __A
, __m128i __B
)
1375 return (__m128i
)__builtin_ia32_pmaxsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1378 static __inline __m128i
1379 _mm_max_epu8 (__m128i __A
, __m128i __B
)
1381 return (__m128i
)__builtin_ia32_pmaxub128 ((__v16qi
)__A
, (__v16qi
)__B
);
1384 static __inline __m128i
1385 _mm_min_epi16 (__m128i __A
, __m128i __B
)
1387 return (__m128i
)__builtin_ia32_pminsw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1390 static __inline __m128i
1391 _mm_min_epu8 (__m128i __A
, __m128i __B
)
1393 return (__m128i
)__builtin_ia32_pminub128 ((__v16qi
)__A
, (__v16qi
)__B
);
1397 _mm_movemask_epi8 (__m128i __A
)
1399 return __builtin_ia32_pmovmskb128 ((__v16qi
)__A
);
1402 static __inline __m128i
1403 _mm_mulhi_epu16 (__m128i __A
, __m128i __B
)
1405 return (__m128i
)__builtin_ia32_pmulhuw128 ((__v8hi
)__A
, (__v8hi
)__B
);
/* Integer shuffles with selector B, through the pshufhw / pshuflw /
   pshufd builtins.  Arguments are parenthesized so the casts cover the
   whole argument expression.  */
#define _mm_shufflehi_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1412 static __inline
void
1413 _mm_maskmoveu_si128 (__m128i __A
, __m128i __B
, char *__C
)
1415 __builtin_ia32_maskmovdqu ((__v16qi
)__A
, (__v16qi
)__B
, __C
);
1418 static __inline __m128i
1419 _mm_avg_epu8 (__m128i __A
, __m128i __B
)
1421 return (__m128i
)__builtin_ia32_pavgb128 ((__v16qi
)__A
, (__v16qi
)__B
);
1424 static __inline __m128i
1425 _mm_avg_epu16 (__m128i __A
, __m128i __B
)
1427 return (__m128i
)__builtin_ia32_pavgw128 ((__v8hi
)__A
, (__v8hi
)__B
);
1430 static __inline __m128i
1431 _mm_sad_epu8 (__m128i __A
, __m128i __B
)
1433 return (__m128i
)__builtin_ia32_psadbw128 ((__v16qi
)__A
, (__v16qi
)__B
);
1436 static __inline
void
1437 _mm_stream_si32 (int *__A
, int __B
)
1439 __builtin_ia32_movnti (__A
, __B
);
1442 static __inline
void
1443 _mm_stream_si128 (__m128i
*__A
, __m128i __B
)
1445 __builtin_ia32_movntdq ((__v2di
*)__A
, (__v2di
)__B
);
1448 static __inline
void
1449 _mm_stream_pd (double *__A
, __m128d __B
)
1451 __builtin_ia32_movntpd (__A
, (__v2df
)__B
);
1454 static __inline __m128i
1455 _mm_movpi64_epi64 (__m64 __A
)
1457 return (__m128i
)__builtin_ia32_movq2dq ((unsigned long long)__A
);
1460 static __inline
void
1461 _mm_clflush (void const *__A
)
1463 return __builtin_ia32_clflush (__A
);
/* Fence wrappers around the lfence / mfence builtins.
   NOTE(review): the name lines of both definitions were missing in this
   copy; _mm_lfence and _mm_mfence are restored from the Intel intrinsic
   names for these instructions -- confirm against the reference
   header.  */
static __inline void
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

static __inline void
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
1478 static __inline __m128i
1479 _mm_cvtsi32_si128 (int __A
)
1481 return (__m128i
) __builtin_ia32_loadd (&__A
);
1485 static __inline __m128i
1486 _mm_cvtsi64x_si128 (long long __A
)
1488 return (__m128i
) __builtin_ia32_movq2dq (__A
);
1492 #endif /* __SSE2__ */
1494 #endif /* _EMMINTRIN_H_INCLUDED */