emmintrin.h (_mm_cvtsi128_si32): Move earlier.
[gcc.git] / gcc / config / i386 / emmintrin.h
1 /* Copyright (C) 2003, 2004 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING. If not, write to
17 the Free Software Foundation, 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20 /* As a special exception, if you include this header file into source
21 files compiled by GCC, this header file does not by itself cause
22 the resulting executable to be covered by the GNU General Public
23 License. This exception does not however invalidate any other
24 reasons why the executable file might be covered by the GNU General
25 Public License. */
26
27 /* Implemented from the specification included in the Intel C++ Compiler
28 User Guide and Reference, version 8.0. */
29
30 #ifndef _EMMINTRIN_H_INCLUDED
31 #define _EMMINTRIN_H_INCLUDED
32
33 #ifdef __SSE2__
34 #include <xmmintrin.h>
35
36 /* SSE2 */
37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39 typedef int __v4si __attribute__ ((__vector_size__ (16)));
40 typedef short __v8hi __attribute__ ((__vector_size__ (16)));
41 typedef char __v16qi __attribute__ ((__vector_size__ (16)));
42
43 /* Create a selector for use with the SHUFPD instruction. */
44 #define _MM_SHUFFLE2(fp1,fp0) \
45 (((fp1) << 1) | (fp0))
46
47 #define __m128i __v2di
48 #define __m128d __v2df
49
50 /* Create a vector with element 0 as *P and the rest zero. */
51 static __inline __m128d
52 _mm_load_sd (double const *__P)
53 {
54 return (__m128d) __builtin_ia32_loadsd (__P);
55 }
56
57 /* Create a vector with all two elements equal to *P. */
58 static __inline __m128d
59 _mm_load1_pd (double const *__P)
60 {
61 __v2df __tmp = __builtin_ia32_loadsd (__P);
62 return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
63 }
64
65 static __inline __m128d
66 _mm_load_pd1 (double const *__P)
67 {
68 return _mm_load1_pd (__P);
69 }
70
71 /* Load two DPFP values from P. The address must be 16-byte aligned. */
72 static __inline __m128d
73 _mm_load_pd (double const *__P)
74 {
75 return (__m128d) __builtin_ia32_loadapd (__P);
76 }
77
78 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
79 static __inline __m128d
80 _mm_loadu_pd (double const *__P)
81 {
82 return (__m128d) __builtin_ia32_loadupd (__P);
83 }
84
85 /* Load two DPFP values in reverse order. The address must be aligned. */
86 static __inline __m128d
87 _mm_loadr_pd (double const *__P)
88 {
89 __v2df __tmp = __builtin_ia32_loadapd (__P);
90 return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
91 }
92
93 /* Create a vector with element 0 as F and the rest zero. */
94 static __inline __m128d
95 _mm_set_sd (double __F)
96 {
97 return (__m128d) __builtin_ia32_loadsd (&__F);
98 }
99
100 /* Create a vector with all two elements equal to F. */
101 static __inline __m128d
102 _mm_set1_pd (double __F)
103 {
104 __v2df __tmp = __builtin_ia32_loadsd (&__F);
105 return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
106 }
107
108 static __inline __m128d
109 _mm_set_pd1 (double __F)
110 {
111 return _mm_set1_pd (__F);
112 }
113
114 /* Create the vector [Z Y]. */
115 static __inline __m128d
116 _mm_set_pd (double __Z, double __Y)
117 {
118 return (__v2df) {__Y, __Z};
119 }
120
121 /* Create the vector [Y Z]. */
122 static __inline __m128d
123 _mm_setr_pd (double __Z, double __Y)
124 {
125 return _mm_set_pd (__Y, __Z);
126 }
127
128 /* Create a vector of zeros. */
129 static __inline __m128d
130 _mm_setzero_pd (void)
131 {
132 return (__m128d) __builtin_ia32_setzeropd ();
133 }
134
135 /* Stores the lower DPFP value. */
136 static __inline void
137 _mm_store_sd (double *__P, __m128d __A)
138 {
139 __builtin_ia32_storesd (__P, (__v2df)__A);
140 }
141
142 /* Store the lower DPFP value across two words. */
143 static __inline void
144 _mm_store1_pd (double *__P, __m128d __A)
145 {
146 __v2df __va = (__v2df)__A;
147 __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0));
148 __builtin_ia32_storeapd (__P, __tmp);
149 }
150
151 static __inline void
152 _mm_store_pd1 (double *__P, __m128d __A)
153 {
154 _mm_store1_pd (__P, __A);
155 }
156
157 /* Store two DPFP values. The address must be 16-byte aligned. */
158 static __inline void
159 _mm_store_pd (double *__P, __m128d __A)
160 {
161 __builtin_ia32_storeapd (__P, (__v2df)__A);
162 }
163
164 /* Store two DPFP values. The address need not be 16-byte aligned. */
165 static __inline void
166 _mm_storeu_pd (double *__P, __m128d __A)
167 {
168 __builtin_ia32_storeupd (__P, (__v2df)__A);
169 }
170
171 /* Store two DPFP values in reverse order. The address must be aligned. */
172 static __inline void
173 _mm_storer_pd (double *__P, __m128d __A)
174 {
175 __v2df __va = (__v2df)__A;
176 __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1));
177 __builtin_ia32_storeapd (__P, __tmp);
178 }
179
180 static __inline int
181 _mm_cvtsi128_si32 (__m128i __A)
182 {
183 int __tmp;
184 __builtin_ia32_stored (&__tmp, (__v4si)__A);
185 return __tmp;
186 }
187
188 #ifdef __x86_64__
189 static __inline long long
190 _mm_cvtsi128_si64x (__m128i __A)
191 {
192 return __builtin_ia32_movdq2q ((__v2di)__A);
193 }
194 #endif
195
196 /* Sets the low DPFP value of A from the low value of B. */
197 static __inline __m128d
198 _mm_move_sd (__m128d __A, __m128d __B)
199 {
200 return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
201 }
202
203
204 static __inline __m128d
205 _mm_add_pd (__m128d __A, __m128d __B)
206 {
207 return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
208 }
209
210 static __inline __m128d
211 _mm_add_sd (__m128d __A, __m128d __B)
212 {
213 return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
214 }
215
216 static __inline __m128d
217 _mm_sub_pd (__m128d __A, __m128d __B)
218 {
219 return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
220 }
221
222 static __inline __m128d
223 _mm_sub_sd (__m128d __A, __m128d __B)
224 {
225 return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
226 }
227
228 static __inline __m128d
229 _mm_mul_pd (__m128d __A, __m128d __B)
230 {
231 return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
232 }
233
234 static __inline __m128d
235 _mm_mul_sd (__m128d __A, __m128d __B)
236 {
237 return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
238 }
239
240 static __inline __m128d
241 _mm_div_pd (__m128d __A, __m128d __B)
242 {
243 return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
244 }
245
246 static __inline __m128d
247 _mm_div_sd (__m128d __A, __m128d __B)
248 {
249 return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
250 }
251
252 static __inline __m128d
253 _mm_sqrt_pd (__m128d __A)
254 {
255 return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
256 }
257
258 /* Return pair {sqrt (A[0), B[1]}. */
259 static __inline __m128d
260 _mm_sqrt_sd (__m128d __A, __m128d __B)
261 {
262 __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
263 return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
264 }
265
266 static __inline __m128d
267 _mm_min_pd (__m128d __A, __m128d __B)
268 {
269 return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
270 }
271
272 static __inline __m128d
273 _mm_min_sd (__m128d __A, __m128d __B)
274 {
275 return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
276 }
277
278 static __inline __m128d
279 _mm_max_pd (__m128d __A, __m128d __B)
280 {
281 return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
282 }
283
284 static __inline __m128d
285 _mm_max_sd (__m128d __A, __m128d __B)
286 {
287 return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
288 }
289
290 static __inline __m128d
291 _mm_and_pd (__m128d __A, __m128d __B)
292 {
293 return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
294 }
295
296 static __inline __m128d
297 _mm_andnot_pd (__m128d __A, __m128d __B)
298 {
299 return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
300 }
301
302 static __inline __m128d
303 _mm_or_pd (__m128d __A, __m128d __B)
304 {
305 return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
306 }
307
308 static __inline __m128d
309 _mm_xor_pd (__m128d __A, __m128d __B)
310 {
311 return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
312 }
313
314 static __inline __m128d
315 _mm_cmpeq_pd (__m128d __A, __m128d __B)
316 {
317 return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
318 }
319
320 static __inline __m128d
321 _mm_cmplt_pd (__m128d __A, __m128d __B)
322 {
323 return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
324 }
325
326 static __inline __m128d
327 _mm_cmple_pd (__m128d __A, __m128d __B)
328 {
329 return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
330 }
331
332 static __inline __m128d
333 _mm_cmpgt_pd (__m128d __A, __m128d __B)
334 {
335 return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
336 }
337
338 static __inline __m128d
339 _mm_cmpge_pd (__m128d __A, __m128d __B)
340 {
341 return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
342 }
343
344 static __inline __m128d
345 _mm_cmpneq_pd (__m128d __A, __m128d __B)
346 {
347 return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
348 }
349
350 static __inline __m128d
351 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
352 {
353 return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
354 }
355
356 static __inline __m128d
357 _mm_cmpnle_pd (__m128d __A, __m128d __B)
358 {
359 return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
360 }
361
362 static __inline __m128d
363 _mm_cmpngt_pd (__m128d __A, __m128d __B)
364 {
365 return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
366 }
367
368 static __inline __m128d
369 _mm_cmpnge_pd (__m128d __A, __m128d __B)
370 {
371 return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
372 }
373
374 static __inline __m128d
375 _mm_cmpord_pd (__m128d __A, __m128d __B)
376 {
377 return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
378 }
379
380 static __inline __m128d
381 _mm_cmpunord_pd (__m128d __A, __m128d __B)
382 {
383 return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
384 }
385
386 static __inline __m128d
387 _mm_cmpeq_sd (__m128d __A, __m128d __B)
388 {
389 return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
390 }
391
392 static __inline __m128d
393 _mm_cmplt_sd (__m128d __A, __m128d __B)
394 {
395 return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
396 }
397
398 static __inline __m128d
399 _mm_cmple_sd (__m128d __A, __m128d __B)
400 {
401 return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
402 }
403
404 static __inline __m128d
405 _mm_cmpgt_sd (__m128d __A, __m128d __B)
406 {
407 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
408 (__v2df)
409 __builtin_ia32_cmpltsd ((__v2df) __B,
410 (__v2df)
411 __A));
412 }
413
414 static __inline __m128d
415 _mm_cmpge_sd (__m128d __A, __m128d __B)
416 {
417 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
418 (__v2df)
419 __builtin_ia32_cmplesd ((__v2df) __B,
420 (__v2df)
421 __A));
422 }
423
424 static __inline __m128d
425 _mm_cmpneq_sd (__m128d __A, __m128d __B)
426 {
427 return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
428 }
429
430 static __inline __m128d
431 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
432 {
433 return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
434 }
435
436 static __inline __m128d
437 _mm_cmpnle_sd (__m128d __A, __m128d __B)
438 {
439 return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
440 }
441
442 static __inline __m128d
443 _mm_cmpngt_sd (__m128d __A, __m128d __B)
444 {
445 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
446 (__v2df)
447 __builtin_ia32_cmpnltsd ((__v2df) __B,
448 (__v2df)
449 __A));
450 }
451
452 static __inline __m128d
453 _mm_cmpnge_sd (__m128d __A, __m128d __B)
454 {
455 return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
456 (__v2df)
457 __builtin_ia32_cmpnlesd ((__v2df) __B,
458 (__v2df)
459 __A));
460 }
461
462 static __inline __m128d
463 _mm_cmpord_sd (__m128d __A, __m128d __B)
464 {
465 return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
466 }
467
468 static __inline __m128d
469 _mm_cmpunord_sd (__m128d __A, __m128d __B)
470 {
471 return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
472 }
473
474 static __inline int
475 _mm_comieq_sd (__m128d __A, __m128d __B)
476 {
477 return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
478 }
479
480 static __inline int
481 _mm_comilt_sd (__m128d __A, __m128d __B)
482 {
483 return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
484 }
485
486 static __inline int
487 _mm_comile_sd (__m128d __A, __m128d __B)
488 {
489 return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
490 }
491
492 static __inline int
493 _mm_comigt_sd (__m128d __A, __m128d __B)
494 {
495 return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
496 }
497
498 static __inline int
499 _mm_comige_sd (__m128d __A, __m128d __B)
500 {
501 return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
502 }
503
504 static __inline int
505 _mm_comineq_sd (__m128d __A, __m128d __B)
506 {
507 return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
508 }
509
510 static __inline int
511 _mm_ucomieq_sd (__m128d __A, __m128d __B)
512 {
513 return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
514 }
515
516 static __inline int
517 _mm_ucomilt_sd (__m128d __A, __m128d __B)
518 {
519 return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
520 }
521
522 static __inline int
523 _mm_ucomile_sd (__m128d __A, __m128d __B)
524 {
525 return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
526 }
527
528 static __inline int
529 _mm_ucomigt_sd (__m128d __A, __m128d __B)
530 {
531 return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
532 }
533
534 static __inline int
535 _mm_ucomige_sd (__m128d __A, __m128d __B)
536 {
537 return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
538 }
539
540 static __inline int
541 _mm_ucomineq_sd (__m128d __A, __m128d __B)
542 {
543 return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
544 }
545
546 /* Create a vector with element 0 as *P and the rest zero. */
547
548 static __inline __m128i
549 _mm_load_si128 (__m128i const *__P)
550 {
551 return (__m128i) __builtin_ia32_loaddqa ((char const *)__P);
552 }
553
554 static __inline __m128i
555 _mm_loadu_si128 (__m128i const *__P)
556 {
557 return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
558 }
559
560 static __inline __m128i
561 _mm_loadl_epi64 (__m128i const *__P)
562 {
563 return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P);
564 }
565
566 static __inline void
567 _mm_store_si128 (__m128i *__P, __m128i __B)
568 {
569 __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B);
570 }
571
572 static __inline void
573 _mm_storeu_si128 (__m128i *__P, __m128i __B)
574 {
575 __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
576 }
577
578 static __inline void
579 _mm_storel_epi64 (__m128i *__P, __m128i __B)
580 {
581 *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B);
582 }
583
584 static __inline __m64
585 _mm_movepi64_pi64 (__m128i __B)
586 {
587 return (__m64) __builtin_ia32_movdq2q ((__v2di)__B);
588 }
589
590 static __inline __m128i
591 _mm_move_epi64 (__m128i __A)
592 {
593 return (__m128i) __builtin_ia32_movq ((__v2di)__A);
594 }
595
596 /* Create a vector of zeros. */
597 static __inline __m128i
598 _mm_setzero_si128 (void)
599 {
600 return (__m128i) __builtin_ia32_setzero128 ();
601 }
602
603 static __inline __m128i
604 _mm_set_epi64 (__m64 __A, __m64 __B)
605 {
606 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
607 __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
608 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp);
609 }
610
611 /* Create the vector [Z Y X W]. */
612 static __inline __m128i
613 _mm_set_epi32 (int __Z, int __Y, int __X, int __W)
614 {
615 union {
616 int __a[4];
617 __m128i __v;
618 } __u;
619
620 __u.__a[0] = __W;
621 __u.__a[1] = __X;
622 __u.__a[2] = __Y;
623 __u.__a[3] = __Z;
624
625 return __u.__v;
626 }
627
628 #ifdef __x86_64__
629 /* Create the vector [Z Y]. */
630 static __inline __m128i
631 _mm_set_epi64x (long long __Z, long long __Y)
632 {
633 union {
634 long __a[2];
635 __m128i __v;
636 } __u;
637
638 __u.__a[0] = __Y;
639 __u.__a[1] = __Z;
640
641 return __u.__v;
642 }
643 #endif
644
645 /* Create the vector [S T U V Z Y X W]. */
646 static __inline __m128i
647 _mm_set_epi16 (short __Z, short __Y, short __X, short __W,
648 short __V, short __U, short __T, short __S)
649 {
650 union {
651 short __a[8];
652 __m128i __v;
653 } __u;
654
655 __u.__a[0] = __S;
656 __u.__a[1] = __T;
657 __u.__a[2] = __U;
658 __u.__a[3] = __V;
659 __u.__a[4] = __W;
660 __u.__a[5] = __X;
661 __u.__a[6] = __Y;
662 __u.__a[7] = __Z;
663
664 return __u.__v;
665 }
666
667 /* Create the vector [S T U V Z Y X W]. */
668 static __inline __m128i
669 _mm_set_epi8 (char __Z, char __Y, char __X, char __W,
670 char __V, char __U, char __T, char __S,
671 char __Z1, char __Y1, char __X1, char __W1,
672 char __V1, char __U1, char __T1, char __S1)
673 {
674 union {
675 char __a[16];
676 __m128i __v;
677 } __u;
678
679 __u.__a[0] = __S1;
680 __u.__a[1] = __T1;
681 __u.__a[2] = __U1;
682 __u.__a[3] = __V1;
683 __u.__a[4] = __W1;
684 __u.__a[5] = __X1;
685 __u.__a[6] = __Y1;
686 __u.__a[7] = __Z1;
687 __u.__a[8] = __S;
688 __u.__a[9] = __T;
689 __u.__a[10] = __U;
690 __u.__a[11] = __V;
691 __u.__a[12] = __W;
692 __u.__a[13] = __X;
693 __u.__a[14] = __Y;
694 __u.__a[15] = __Z;
695
696 return __u.__v;
697 }
698
699 static __inline __m128i
700 _mm_set1_epi64 (__m64 __A)
701 {
702 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
703 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp);
704 }
705
706 static __inline __m128i
707 _mm_set1_epi32 (int __A)
708 {
709 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A);
710 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
711 }
712
713 #ifdef __x86_64__
714 static __inline __m128i
715 _mm_set1_epi64x (long long __A)
716 {
717 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
718 return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0));
719 }
720 #endif
721
722 static __inline __m128i
723 _mm_set1_epi16 (short __A)
724 {
725 int __Acopy = (unsigned short)__A;
726 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
727 __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp);
728 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
729 }
730
731 static __inline __m128i
732 _mm_set1_epi8 (char __A)
733 {
734 int __Acopy = (unsigned char)__A;
735 __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
736 __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
737 __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
738 return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
739 }
740
741 static __inline __m128i
742 _mm_setr_epi64 (__m64 __A, __m64 __B)
743 {
744 __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
745 __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
746 return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2);
747 }
748
749 /* Create the vector [Z Y X W]. */
750 static __inline __m128i
751 _mm_setr_epi32 (int __W, int __X, int __Y, int __Z)
752 {
753 union {
754 int __a[4];
755 __m128i __v;
756 } __u;
757
758 __u.__a[0] = __W;
759 __u.__a[1] = __X;
760 __u.__a[2] = __Y;
761 __u.__a[3] = __Z;
762
763 return __u.__v;
764 }
765 /* Create the vector [S T U V Z Y X W]. */
766 static __inline __m128i
767 _mm_setr_epi16 (short __S, short __T, short __U, short __V,
768 short __W, short __X, short __Y, short __Z)
769 {
770 union {
771 short __a[8];
772 __m128i __v;
773 } __u;
774
775 __u.__a[0] = __S;
776 __u.__a[1] = __T;
777 __u.__a[2] = __U;
778 __u.__a[3] = __V;
779 __u.__a[4] = __W;
780 __u.__a[5] = __X;
781 __u.__a[6] = __Y;
782 __u.__a[7] = __Z;
783
784 return __u.__v;
785 }
786
787 /* Create the vector [S T U V Z Y X W]. */
788 static __inline __m128i
789 _mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1,
790 char __W1, char __X1, char __Y1, char __Z1,
791 char __S, char __T, char __U, char __V,
792 char __W, char __X, char __Y, char __Z)
793 {
794 union {
795 char __a[16];
796 __m128i __v;
797 } __u;
798
799 __u.__a[0] = __S1;
800 __u.__a[1] = __T1;
801 __u.__a[2] = __U1;
802 __u.__a[3] = __V1;
803 __u.__a[4] = __W1;
804 __u.__a[5] = __X1;
805 __u.__a[6] = __Y1;
806 __u.__a[7] = __Z1;
807 __u.__a[8] = __S;
808 __u.__a[9] = __T;
809 __u.__a[10] = __U;
810 __u.__a[11] = __V;
811 __u.__a[12] = __W;
812 __u.__a[13] = __X;
813 __u.__a[14] = __Y;
814 __u.__a[15] = __Z;
815
816 return __u.__v;
817 }
818
819 static __inline __m128d
820 _mm_cvtepi32_pd (__m128i __A)
821 {
822 return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
823 }
824
825 static __inline __m128
826 _mm_cvtepi32_ps (__m128i __A)
827 {
828 return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
829 }
830
831 static __inline __m128i
832 _mm_cvtpd_epi32 (__m128d __A)
833 {
834 return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
835 }
836
837 static __inline __m64
838 _mm_cvtpd_pi32 (__m128d __A)
839 {
840 return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
841 }
842
843 static __inline __m128
844 _mm_cvtpd_ps (__m128d __A)
845 {
846 return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
847 }
848
849 static __inline __m128i
850 _mm_cvttpd_epi32 (__m128d __A)
851 {
852 return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
853 }
854
855 static __inline __m64
856 _mm_cvttpd_pi32 (__m128d __A)
857 {
858 return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
859 }
860
861 static __inline __m128d
862 _mm_cvtpi32_pd (__m64 __A)
863 {
864 return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
865 }
866
867 static __inline __m128i
868 _mm_cvtps_epi32 (__m128 __A)
869 {
870 return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
871 }
872
873 static __inline __m128i
874 _mm_cvttps_epi32 (__m128 __A)
875 {
876 return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
877 }
878
879 static __inline __m128d
880 _mm_cvtps_pd (__m128 __A)
881 {
882 return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
883 }
884
885 static __inline int
886 _mm_cvtsd_si32 (__m128d __A)
887 {
888 return __builtin_ia32_cvtsd2si ((__v2df) __A);
889 }
890
891 #ifdef __x86_64__
892 static __inline long long
893 _mm_cvtsd_si64x (__m128d __A)
894 {
895 return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
896 }
897 #endif
898
899 static __inline int
900 _mm_cvttsd_si32 (__m128d __A)
901 {
902 return __builtin_ia32_cvttsd2si ((__v2df) __A);
903 }
904
905 #ifdef __x86_64__
906 static __inline long long
907 _mm_cvttsd_si64x (__m128d __A)
908 {
909 return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
910 }
911 #endif
912
913 static __inline __m128
914 _mm_cvtsd_ss (__m128 __A, __m128d __B)
915 {
916 return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
917 }
918
919 static __inline __m128d
920 _mm_cvtsi32_sd (__m128d __A, int __B)
921 {
922 return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
923 }
924
925 #ifdef __x86_64__
926 static __inline __m128d
927 _mm_cvtsi64x_sd (__m128d __A, long long __B)
928 {
929 return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
930 }
931 #endif
932
933 static __inline __m128d
934 _mm_cvtss_sd (__m128d __A, __m128 __B)
935 {
936 return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
937 }
938
939 #define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))
940
941 static __inline __m128d
942 _mm_unpackhi_pd (__m128d __A, __m128d __B)
943 {
944 return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
945 }
946
947 static __inline __m128d
948 _mm_unpacklo_pd (__m128d __A, __m128d __B)
949 {
950 return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
951 }
952
953 static __inline __m128d
954 _mm_loadh_pd (__m128d __A, double const *__B)
955 {
956 return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
957 }
958
959 static __inline void
960 _mm_storeh_pd (double *__A, __m128d __B)
961 {
962 __builtin_ia32_storehpd (__A, (__v2df)__B);
963 }
964
965 static __inline __m128d
966 _mm_loadl_pd (__m128d __A, double const *__B)
967 {
968 return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
969 }
970
971 static __inline void
972 _mm_storel_pd (double *__A, __m128d __B)
973 {
974 __builtin_ia32_storelpd (__A, (__v2df)__B);
975 }
976
977 static __inline int
978 _mm_movemask_pd (__m128d __A)
979 {
980 return __builtin_ia32_movmskpd ((__v2df)__A);
981 }
982
983 static __inline __m128i
984 _mm_packs_epi16 (__m128i __A, __m128i __B)
985 {
986 return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
987 }
988
989 static __inline __m128i
990 _mm_packs_epi32 (__m128i __A, __m128i __B)
991 {
992 return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
993 }
994
995 static __inline __m128i
996 _mm_packus_epi16 (__m128i __A, __m128i __B)
997 {
998 return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
999 }
1000
1001 static __inline __m128i
1002 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1003 {
1004 return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
1005 }
1006
1007 static __inline __m128i
1008 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1009 {
1010 return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
1011 }
1012
1013 static __inline __m128i
1014 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1015 {
1016 return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
1017 }
1018
1019 static __inline __m128i
1020 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1021 {
1022 return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
1023 }
1024
1025 static __inline __m128i
1026 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1027 {
1028 return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
1029 }
1030
1031 static __inline __m128i
1032 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1033 {
1034 return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
1035 }
1036
1037 static __inline __m128i
1038 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1039 {
1040 return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
1041 }
1042
1043 static __inline __m128i
1044 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1045 {
1046 return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
1047 }
1048
1049 static __inline __m128i
1050 _mm_add_epi8 (__m128i __A, __m128i __B)
1051 {
1052 return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
1053 }
1054
1055 static __inline __m128i
1056 _mm_add_epi16 (__m128i __A, __m128i __B)
1057 {
1058 return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
1059 }
1060
1061 static __inline __m128i
1062 _mm_add_epi32 (__m128i __A, __m128i __B)
1063 {
1064 return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
1065 }
1066
1067 static __inline __m128i
1068 _mm_add_epi64 (__m128i __A, __m128i __B)
1069 {
1070 return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
1071 }
1072
1073 static __inline __m128i
1074 _mm_adds_epi8 (__m128i __A, __m128i __B)
1075 {
1076 return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
1077 }
1078
1079 static __inline __m128i
1080 _mm_adds_epi16 (__m128i __A, __m128i __B)
1081 {
1082 return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
1083 }
1084
1085 static __inline __m128i
1086 _mm_adds_epu8 (__m128i __A, __m128i __B)
1087 {
1088 return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
1089 }
1090
1091 static __inline __m128i
1092 _mm_adds_epu16 (__m128i __A, __m128i __B)
1093 {
1094 return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
1095 }
1096
1097 static __inline __m128i
1098 _mm_sub_epi8 (__m128i __A, __m128i __B)
1099 {
1100 return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
1101 }
1102
1103 static __inline __m128i
1104 _mm_sub_epi16 (__m128i __A, __m128i __B)
1105 {
1106 return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
1107 }
1108
1109 static __inline __m128i
1110 _mm_sub_epi32 (__m128i __A, __m128i __B)
1111 {
1112 return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
1113 }
1114
1115 static __inline __m128i
1116 _mm_sub_epi64 (__m128i __A, __m128i __B)
1117 {
1118 return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
1119 }
1120
1121 static __inline __m128i
1122 _mm_subs_epi8 (__m128i __A, __m128i __B)
1123 {
1124 return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
1125 }
1126
1127 static __inline __m128i
1128 _mm_subs_epi16 (__m128i __A, __m128i __B)
1129 {
1130 return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
1131 }
1132
1133 static __inline __m128i
1134 _mm_subs_epu8 (__m128i __A, __m128i __B)
1135 {
1136 return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
1137 }
1138
1139 static __inline __m128i
1140 _mm_subs_epu16 (__m128i __A, __m128i __B)
1141 {
1142 return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
1143 }
1144
1145 static __inline __m128i
1146 _mm_madd_epi16 (__m128i __A, __m128i __B)
1147 {
1148 return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
1149 }
1150
1151 static __inline __m128i
1152 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1153 {
1154 return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
1155 }
1156
1157 static __inline __m128i
1158 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1159 {
1160 return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
1161 }
1162
1163 static __inline __m64
1164 _mm_mul_su32 (__m64 __A, __m64 __B)
1165 {
1166 return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
1167 }
1168
1169 static __inline __m128i
1170 _mm_mul_epu32 (__m128i __A, __m128i __B)
1171 {
1172 return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
1173 }
1174
1175 static __inline __m128i
1176 _mm_slli_epi16 (__m128i __A, int __B)
1177 {
1178 return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
1179 }
1180
1181 static __inline __m128i
1182 _mm_slli_epi32 (__m128i __A, int __B)
1183 {
1184 return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
1185 }
1186
1187 static __inline __m128i
1188 _mm_slli_epi64 (__m128i __A, int __B)
1189 {
1190 return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
1191 }
1192
1193 static __inline __m128i
1194 _mm_srai_epi16 (__m128i __A, int __B)
1195 {
1196 return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
1197 }
1198
1199 static __inline __m128i
1200 _mm_srai_epi32 (__m128i __A, int __B)
1201 {
1202 return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
1203 }
1204
#if 0
/* These would be preferable as inline functions, but kept disabled; note
   the builtins take a bit count, hence the byte -> bit scaling by 8.  */
static __m128i __attribute__((__always_inline__))
_mm_srli_si128 (__m128i __A, const int __B)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8);
}

/* Was wrongly a duplicate definition of _mm_srli_si128; this is the
   left-shift variant, matching the PSLLDQ builtin it calls.  */
static __m128i __attribute__((__always_inline__))
_mm_slli_si128 (__m128i __A, const int __B)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8);
}
#else
/* Shift the whole register right/left by __B bytes.  */
#define _mm_srli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_psrldqi128 (__A, (__B) * 8))
#define _mm_slli_si128(__A, __B) \
  ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8))
#endif
1223
1224 static __inline __m128i
1225 _mm_srli_epi16 (__m128i __A, int __B)
1226 {
1227 return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
1228 }
1229
1230 static __inline __m128i
1231 _mm_srli_epi32 (__m128i __A, int __B)
1232 {
1233 return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
1234 }
1235
1236 static __inline __m128i
1237 _mm_srli_epi64 (__m128i __A, int __B)
1238 {
1239 return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
1240 }
1241
1242 static __inline __m128i
1243 _mm_sll_epi16 (__m128i __A, __m128i __B)
1244 {
1245 return _mm_slli_epi16 (__A, _mm_cvtsi128_si32 (__B));
1246 }
1247
1248 static __inline __m128i
1249 _mm_sll_epi32 (__m128i __A, __m128i __B)
1250 {
1251 return _mm_slli_epi32 (__A, _mm_cvtsi128_si32 (__B));
1252 }
1253
1254 static __inline __m128i
1255 _mm_sll_epi64 (__m128i __A, __m128i __B)
1256 {
1257 return _mm_slli_epi64 (__A, _mm_cvtsi128_si32 (__B));
1258 }
1259
1260 static __inline __m128i
1261 _mm_sra_epi16 (__m128i __A, __m128i __B)
1262 {
1263 return _mm_srai_epi16 (__A, _mm_cvtsi128_si32 (__B));
1264 }
1265
1266 static __inline __m128i
1267 _mm_sra_epi32 (__m128i __A, __m128i __B)
1268 {
1269 return _mm_srai_epi32 (__A, _mm_cvtsi128_si32 (__B));
1270 }
1271
1272 static __inline __m128i
1273 _mm_srl_epi16 (__m128i __A, __m128i __B)
1274 {
1275 return _mm_srli_epi16 (__A, _mm_cvtsi128_si32 (__B));
1276 }
1277
1278 static __inline __m128i
1279 _mm_srl_epi32 (__m128i __A, __m128i __B)
1280 {
1281 return _mm_srli_epi32 (__A, _mm_cvtsi128_si32 (__B));
1282 }
1283
1284 static __inline __m128i
1285 _mm_srl_epi64 (__m128i __A, __m128i __B)
1286 {
1287 return _mm_srli_epi64 (__A, _mm_cvtsi128_si32 (__B));
1288 }
1289
1290 static __inline __m128i
1291 _mm_and_si128 (__m128i __A, __m128i __B)
1292 {
1293 return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
1294 }
1295
1296 static __inline __m128i
1297 _mm_andnot_si128 (__m128i __A, __m128i __B)
1298 {
1299 return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
1300 }
1301
1302 static __inline __m128i
1303 _mm_or_si128 (__m128i __A, __m128i __B)
1304 {
1305 return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
1306 }
1307
1308 static __inline __m128i
1309 _mm_xor_si128 (__m128i __A, __m128i __B)
1310 {
1311 return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
1312 }
1313
1314 static __inline __m128i
1315 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1316 {
1317 return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
1318 }
1319
1320 static __inline __m128i
1321 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1322 {
1323 return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
1324 }
1325
1326 static __inline __m128i
1327 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1328 {
1329 return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
1330 }
1331
1332 static __inline __m128i
1333 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
1334 {
1335 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
1336 }
1337
1338 static __inline __m128i
1339 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
1340 {
1341 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
1342 }
1343
1344 static __inline __m128i
1345 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
1346 {
1347 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
1348 }
1349
1350 static __inline __m128i
1351 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1352 {
1353 return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
1354 }
1355
1356 static __inline __m128i
1357 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1358 {
1359 return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
1360 }
1361
1362 static __inline __m128i
1363 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1364 {
1365 return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
1366 }
1367
/* PEXTRW: extract the 16-bit element of __A selected by immediate __B.
   Arguments are parenthesized so expression operands (e.g. a + b)
   parse correctly under the casts.  */
#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)(__A), (__B))

/* PINSRW: insert the 16-bit value __B into __A at index __C.  */
#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)(__A), (__B), (__C)))
1371
1372 static __inline __m128i
1373 _mm_max_epi16 (__m128i __A, __m128i __B)
1374 {
1375 return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
1376 }
1377
1378 static __inline __m128i
1379 _mm_max_epu8 (__m128i __A, __m128i __B)
1380 {
1381 return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
1382 }
1383
1384 static __inline __m128i
1385 _mm_min_epi16 (__m128i __A, __m128i __B)
1386 {
1387 return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
1388 }
1389
1390 static __inline __m128i
1391 _mm_min_epu8 (__m128i __A, __m128i __B)
1392 {
1393 return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
1394 }
1395
1396 static __inline int
1397 _mm_movemask_epi8 (__m128i __A)
1398 {
1399 return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
1400 }
1401
1402 static __inline __m128i
1403 _mm_mulhi_epu16 (__m128i __A, __m128i __B)
1404 {
1405 return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
1406 }
1407
/* PSHUFHW/PSHUFLW/PSHUFD: shuffle elements according to the immediate
   selector __B.  Arguments are parenthesized so expression operands
   parse correctly under the casts.  */
#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__A), (__B)))
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__A), (__B)))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__A), (__B)))
1411
1412 static __inline void
1413 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
1414 {
1415 __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
1416 }
1417
1418 static __inline __m128i
1419 _mm_avg_epu8 (__m128i __A, __m128i __B)
1420 {
1421 return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
1422 }
1423
1424 static __inline __m128i
1425 _mm_avg_epu16 (__m128i __A, __m128i __B)
1426 {
1427 return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
1428 }
1429
1430 static __inline __m128i
1431 _mm_sad_epu8 (__m128i __A, __m128i __B)
1432 {
1433 return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
1434 }
1435
static __inline void
_mm_stream_si32 (int *__P, int __X)
{
  /* MOVNTI: non-temporal (cache-bypassing) store of __X to *__P.  */
  __builtin_ia32_movnti (__P, __X);
}
1441
1442 static __inline void
1443 _mm_stream_si128 (__m128i *__A, __m128i __B)
1444 {
1445 __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
1446 }
1447
1448 static __inline void
1449 _mm_stream_pd (double *__A, __m128d __B)
1450 {
1451 __builtin_ia32_movntpd (__A, (__v2df)__B);
1452 }
1453
1454 static __inline __m128i
1455 _mm_movpi64_epi64 (__m64 __A)
1456 {
1457 return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A);
1458 }
1459
static __inline void
_mm_clflush (void const *__A)
{
  /* CLFLUSH: flush the cache line containing __A.  A `return' with a
     void expression is not valid ISO C in a void function, so call
     the builtin as a plain statement.  */
  __builtin_ia32_clflush (__A);
}
1465
static __inline void
_mm_lfence (void)
{
  /* LFENCE: load fence — orders all preceding loads before any
     subsequent loads.  */
  __builtin_ia32_lfence ();
}
1471
static __inline void
_mm_mfence (void)
{
  /* MFENCE: full memory fence — orders all preceding loads and
     stores before any subsequent ones.  */
  __builtin_ia32_mfence ();
}
1477
1478 static __inline __m128i
1479 _mm_cvtsi32_si128 (int __A)
1480 {
1481 return (__m128i) __builtin_ia32_loadd (&__A);
1482 }
1483
1484 #ifdef __x86_64__
1485 static __inline __m128i
1486 _mm_cvtsi64x_si128 (long long __A)
1487 {
1488 return (__m128i) __builtin_ia32_movq2dq (__A);
1489 }
1490 #endif
1491
1492 #endif /* __SSE2__ */
1493
1494 #endif /* _EMMINTRIN_H_INCLUDED */