xmmintrin.h (_MM_TRANSPOSE4_PS): Rewrite using high/low moves and unpack to speed up.
author      Evan Cheng <evan.cheng@apple.com>
Wed, 30 Nov 2005 06:29:07 +0000 (06:29 +0000)
committer   Eric Christopher <echristo@gcc.gnu.org>
Wed, 30 Nov 2005 06:29:07 +0000 (06:29 +0000)
2005-11-29  Evan Cheng  <evan.cheng@apple.com>

        * config/i386/xmmintrin.h (_MM_TRANSPOSE4_PS): Rewrite using high/low
        moves and unpack to speed up.

From-SVN: r107700

gcc/ChangeLog
gcc/config/i386/xmmintrin.h

index 89674d99baa6d60fab4a57b74bfce934091e670b..9c49626ab16d79b454e644b2adddcbbccf2b1ebb 100644 (file)
@@ -1,3 +1,8 @@
+2005-11-29  Evan Cheng  <evan.cheng@apple.com>
+
+       * config/i386/xmmintrin.h (_MM_TRANSPOSE4_PS): Rewrite using high/low
+       moves and unpack to speed up.
+
 2005-11-29  David S. Miller  <davem@sunset.davemloft.net>
 
        * config/sparc/sparc.c (gen_compare_reg): Kill 2nd and 3rd
            Uros Bizjak  <uros@kss-loka.si>
 
        PR middle-end/20219
-       * fold-const.c (fold binary) <RDIV_EXPR>: Optimize 
+       * fold-const.c (fold binary) <RDIV_EXPR>: Optimize
        sin(x)/tan(x) as cos(x) and tan(x)/sin(x) as 1.0/cos(x)
        when flag_unsafe_math_optimizations is set and
        we don't care about NaNs or Infinities.
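
(Context only; the RDIV_EXPR entry above belongs to an earlier commit and merely shows up here because its trailing whitespace was stripped. As a minimal sketch of the kind of code that folding applies to, assuming -funsafe-math-optimizations and -ffinite-math-only, with a hypothetical function name:

  /* With -funsafe-math-optimizations and -ffinite-math-only, GCC's
     fold-const.c may rewrite this quotient as cos (x).  */
  #include <math.h>

  double
  quotient (double x)
  {
    return sin (x) / tan (x);
  }
)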
index b80d6b58ec7f023d1dafb7851a9339f4f7d88be5..fb4d38c038bde9eef60407d3bdaa85447403fa89 100644 (file)
@@ -1197,14 +1197,14 @@ _mm_pause (void)
 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                      \
 do {                                                                   \
   __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);   \
-  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);              \
-  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);              \
-  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);              \
-  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);              \
-  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);                   \
-  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);                   \
-  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);                   \
-  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);                   \
+  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);                  \
+  __v4sf __t2 = __builtin_ia32_unpcklps (__r2, __r3);                  \
+  __v4sf __t1 = __builtin_ia32_unpckhps (__r0, __r1);                  \
+  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);                  \
+  (row0) = __builtin_ia32_movlhps (__t0, __t1);                                \
+  (row1) = __builtin_ia32_movhlps (__t1, __t0);                                \
+  (row2) = __builtin_ia32_movlhps (__t2, __t3);                                \
+  (row3) = __builtin_ia32_movhlps (__t3, __t2);                                \
 } while (0)
 
 /* For backward source compatibility.  */
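
For reference, the unpack-plus-high/low-move transpose the ChangeLog entry describes can be sketched with the portable intrinsics that xmmintrin.h itself provides. This is an illustrative helper (hypothetical name, not the committed macro): the low unpacks of the two row pairs are combined with movlhps, the high unpacks with movhlps, replacing the eight shufps operations of the old version.

  /* Illustrative 4x4 transpose using unpack plus high/low moves;
     element comments show row letters a..d and column indices 0..3.  */
  #include <xmmintrin.h>

  static void
  transpose4 (__m128 *row0, __m128 *row1, __m128 *row2, __m128 *row3)
  {
    __m128 lo01 = _mm_unpacklo_ps (*row0, *row1);   /* a0 b0 a1 b1 */
    __m128 lo23 = _mm_unpacklo_ps (*row2, *row3);   /* c0 d0 c1 d1 */
    __m128 hi01 = _mm_unpackhi_ps (*row0, *row1);   /* a2 b2 a3 b3 */
    __m128 hi23 = _mm_unpackhi_ps (*row2, *row3);   /* c2 d2 c3 d3 */
    *row0 = _mm_movelh_ps (lo01, lo23);             /* a0 b0 c0 d0 */
    *row1 = _mm_movehl_ps (lo23, lo01);             /* a1 b1 c1 d1 */
    *row2 = _mm_movelh_ps (hi01, hi23);             /* a2 b2 c2 d2 */
    *row3 = _mm_movehl_ps (hi23, hi01);             /* a3 b3 c3 d3 */
  }

Both forms use eight shuffle-class operations, but unpcklps/unpckhps/movlhps/movhlps take no immediate operand and were cheaper than shufps on contemporary implementations, which is presumably the speedup the ChangeLog entry refers to.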