From be7724ed748407d7edb5b2b3958c37a9745ec40a Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Wed, 30 Nov 2005 06:29:07 +0000 Subject: [PATCH] xmmintrin.h (_MM_TRANSPOSE4_PS): Rewrite using high/low moves and unpack to speed up. 2005-11-29 Evan Cheng * config/i386/xmmintrin.h (_MM_TRANSPOSE4_PS): Rewrite using high/low moves and unpack to speed up. From-SVN: r107700 --- gcc/ChangeLog | 7 ++++++- gcc/config/i386/xmmintrin.h | 16 ++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 89674d99baa..9c49626ab16 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2005-11-29 Evan Cheng + + * config/i386/xmmintrin.h (_MM_TRANSPOSE4_PS): Rewrite using high/low + moves and unpack to speed up. + 2005-11-29 David S. Miller * config/sparc/sparc.c (gen_compare_reg): Kill 2nd and 3rd @@ -107,7 +112,7 @@ Uros Bizjak PR middle-end/20219 - * fold-const.c (fold binary) : Optimize + * fold-const.c (fold binary) : Optimize sin(x)/tan(x) as cos(x) and tan(x)/sin(x) as 1.0/cos(x) when flag_unsafe_math_optimizations is set and we don't care about NaNs or Infinities. 
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index b80d6b58ec7..fb4d38c038b 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1197,14 +1197,20 @@ _mm_pause (void)
 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
 do { \
   __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
-  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44); \
-  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE); \
-  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44); \
-  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE); \
-  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88); \
-  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD); \
-  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88); \
-  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \
+  /* Interleave the low (unpcklps) and high (unpckhps) halves of each \
+     row pair: __t0 = {r0[0], r1[0], r0[1], r1[1]}, __t1 likewise for \
+     rows 2-3; __t2 and __t3 hold elements 2-3 of the same pairs.  */ \
+  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
+  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
+  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
+  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
+  /* Join matching halves so that each output row is one input column. \
+     movhlps places the high half of its second operand first, hence \
+     the swapped argument order on the odd rows.  */ \
+  (row0) = __builtin_ia32_movlhps (__t0, __t1); \
+  (row1) = __builtin_ia32_movhlps (__t1, __t0); \
+  (row2) = __builtin_ia32_movlhps (__t2, __t3); \
+  (row3) = __builtin_ia32_movhlps (__t3, __t2); \
 } while (0)
 
 /* For backward source compatibility.  */
-- 
2.30.2