[x86] Don't use builtins for unaligned load/store

author Marc Glisse <marc.glisse@inria.fr>

Wed, 31 Aug 2016 11:56:37 +0000 (13:56 +0200)

committer Marc Glisse <glisse@gcc.gnu.org>

Wed, 31 Aug 2016 11:56:37 +0000 (11:56 +0000)
author Marc Glisse <marc.glisse@inria.fr>
Wed, 31 Aug 2016 11:56:37 +0000 (13:56 +0200)
committer Marc Glisse <glisse@gcc.gnu.org>
Wed, 31 Aug 2016 11:56:37 +0000 (11:56 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index e9dd53779c808daa41f6e89b4e7231b67b9931a6..e9dfc9186a95fb5c16a908472f896dc16d8ea0ec 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,22 @@
+2016-08-31  Marc Glisse  <marc.glisse@inria.fr>
+
+       * config/i386/avx512fintrin.h (__m512_u, __m512i_u, __m512d_u):
+       New types.
+       (_mm512_loadu_pd, _mm512_storeu_pd, _mm512_loadu_ps,
+       _mm512_storeu_ps, _mm512_loadu_si512, _mm512_storeu_si512):
+       Replace builtin with vector extension.
+       * config/i386/avxintrin.h (__m256_u, __m256i_u, __m256d_u):
+       New types.
+       (_mm256_loadu_pd, _mm256_storeu_pd, _mm256_loadu_ps,
+       _mm256_storeu_ps, _mm256_loadu_si256, _mm256_storeu_si256):
+       Replace builtin with vector extension.
+       * config/i386/emmintrin.h (__m128i_u, __m128d_u): New types.
+       (_mm_loadu_pd, _mm_storeu_pd, _mm_loadu_si128, _mm_storeu_si128):
+       Replace builtin with vector extension.
+       * config/i386/xmmintrin.h (__m128_u): New type.
+       (_mm_loadu_ps, _mm_storeu_ps): Replace builtin with vector extension.
+       (_mm_load_ps, _mm_store_ps): Simplify.
+
  2016-08-31  Eric Botcazou  <ebotcazou@adacore.com>
  
         * config/arm/arm.c (thumb1_size_rtx_costs) <SET>: Add missing guard.
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h

index 688e8dc00f5197b5a380b3b35fb411b3e6bb5309..2372c83989d1d98b40a150c09427a9c58c1d81a5 100644 (file)
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -52,6 +52,11 @@ typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
  typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
  typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
  
+/* Unaligned version of the same type.  */
+typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+
  typedef unsigned char  __mmask8;
  typedef unsigned short __mmask16;
  
@@ -5674,10 +5679,7 @@ extern __inline __m512d
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  _mm512_loadu_pd (void const *__P)
  {
-  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
-                                                  (__v8df)
-                                                  _mm512_undefined_pd (),
-                                                  (__mmask8) -1);
+  return *(__m512d_u *)__P;
  }
  
  extern __inline __m512d
@@ -5703,8 +5705,7 @@ extern __inline void
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  _mm512_storeu_pd (void *__P, __m512d __A)
  {
-  __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A,
-                                  (__mmask8) -1);
+  *(__m512d_u *)__P = __A;
  }
  
  extern __inline void
@@ -5719,10 +5720,7 @@ extern __inline __m512
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  _mm512_loadu_ps (void const *__P)
  {
-  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
-                                                 (__v16sf)
-                                                 _mm512_undefined_ps (),
-                                                 (__mmask16) -1);
+  return *(__m512_u *)__P;
  }
  
  extern __inline __m512
@@ -5748,8 +5746,7 @@ extern __inline void
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  _mm512_storeu_ps (void *__P, __m512 __A)
  {
-  __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A,
-                                  (__mmask16) -1);
+  *(__m512_u *)__P = __A;
  }
  
  extern __inline void
@@ -5791,10 +5788,7 @@ extern __inline __m512i
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  _mm512_loadu_si512 (void const *__P)
  {
-  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
-                                                    (__v16si)
-                                                    _mm512_setzero_si512 (),
-                                                    (__mmask16) -1);
+  return *(__m512i_u *)__P;
  }
  
  extern __inline __m512i
@@ -5820,8 +5814,7 @@ extern __inline void
  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
  _mm512_storeu_si512 (void *__P, __m512i __A)
  {
-  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
-                                    (__mmask16) -1);
+  *(__m512i_u *)__P = __A;
  }
  
  extern __inline void
diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h

index 9519400176d4310ee2b62c0e45ff26f0743afe4f..9cd9aab918197bbfa4514e1f64fe33ad16b2c151 100644 (file)
--- a/gcc/config/i386/avxintrin.h
+++ b/gcc/config/i386/avxintrin.h
@@ -58,6 +58,17 @@ typedef long long __m256i __attribute__ ((__vector_size__ (32),
  typedef double __m256d __attribute__ ((__vector_size__ (32),
                                        __may_alias__));
  
+/* Unaligned version of the same types.  */
+typedef float __m256_u __attribute__ ((__vector_size__ (32),
+                                      __may_alias__,
+                                      __aligned__ (1)));
+typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
+                                           __may_alias__,
+                                           __aligned__ (1)));
+typedef double __m256d_u __attribute__ ((__vector_size__ (32),
+                                        __may_alias__,
+                                        __aligned__ (1)));
+
  /* Compare predicates for scalar and packed compare intrinsics.  */
  
  /* Equal (ordered, non-signaling)  */
@@ -857,25 +868,25 @@ _mm256_store_ps (float *__P, __m256 __A)
  extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm256_loadu_pd (double const *__P)
  {
-  return (__m256d) __builtin_ia32_loadupd256 (__P);
+  return *(__m256d_u *)__P;
  }
  
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm256_storeu_pd (double *__P, __m256d __A)
  {
-  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
+  *(__m256d_u *)__P = __A;
  }
  
  extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm256_loadu_ps (float const *__P)
  {
-  return (__m256) __builtin_ia32_loadups256 (__P);
+  return *(__m256_u *)__P;
  }
  
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm256_storeu_ps (float *__P, __m256 __A)
  {
-  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
+  *(__m256_u *)__P = __A;
  }
  
  extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -891,15 +902,15 @@ _mm256_store_si256 (__m256i *__P, __m256i __A)
  }
  
  extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_loadu_si256 (__m256i const *__P)
+_mm256_loadu_si256 (__m256i_u const *__P)
  {
-  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
+  return *__P;
  }
  
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_storeu_si256 (__m256i *__P, __m256i __A)
+_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
  {
-  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
+  *__P = __A;
  }
  
  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h

index 8652fe96d0ec52f38e52a7180fb32602e0f28b38..b299cbc8178cbae765b8997b5032fe8e96c07657 100644 (file)
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -52,6 +52,10 @@ typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
  typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
  typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
  
+/* Unaligned version of the same types.  */
+typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
  /* Create a selector for use with the SHUFPD instruction.  */
  #define _MM_SHUFFLE2(fp1,fp0) \
   (((fp1) << 1) | (fp0))
@@ -123,7 +127,7 @@ _mm_load_pd (double const *__P)
  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_loadu_pd (double const *__P)
  {
-  return __builtin_ia32_loadupd (__P);
+  return *(__m128d_u *)__P;
  }
  
  /* Create a vector with all two elements equal to *P.  */
@@ -165,7 +169,7 @@ _mm_store_pd (double *__P, __m128d __A)
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_storeu_pd (double *__P, __m128d __A)
  {
-  __builtin_ia32_storeupd (__P, __A);
+  *(__m128d_u *)__P = __A;
  }
  
  /* Stores the lower DPFP value.  */
@@ -693,9 +697,9 @@ _mm_load_si128 (__m128i const *__P)
  }
  
  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadu_si128 (__m128i const *__P)
+_mm_loadu_si128 (__m128i_u const *__P)
  {
-  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
+  return *__P;
  }
  
  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -711,9 +715,9 @@ _mm_store_si128 (__m128i *__P, __m128i __B)
  }
  
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storeu_si128 (__m128i *__P, __m128i __B)
+_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
  {
-  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
+  *__P = __B;
  }
  
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h

index ffe5771b1c5848106f59858a0cb2e1fd32fbe24b..26516e24a003dfca42b252050e34d78ea0ca97bf 100644 (file)
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -68,6 +68,9 @@ _mm_prefetch (const void *__P, enum _mm_hint __I)
     vector types, and their scalar components.  */
  typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
  
+/* Unaligned version of the same type.  */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
  /* Internal data types for implementing the intrinsics.  */
  typedef float __v4sf __attribute__ ((__vector_size__ (16)));
  
@@ -921,14 +924,14 @@ _mm_load_ps1 (float const *__P)
  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_load_ps (float const *__P)
  {
-  return (__m128) *(__v4sf *)__P;
+  return *(__m128 *)__P;
  }
  
  /* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_loadu_ps (float const *__P)
  {
-  return (__m128) __builtin_ia32_loadups (__P);
+  return *(__m128_u *)__P;
  }
  
  /* Load four SPFP values in reverse order.  The address must be aligned.  */
@@ -970,14 +973,14 @@ _mm_cvtss_f32 (__m128 __A)
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_store_ps (float *__P, __m128 __A)
  {
-  *(__v4sf *)__P = (__v4sf)__A;
+  *(__m128 *)__P = __A;
  }
  
  /* Store four SPFP values.  The address need not be 16-byte aligned.  */
  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_storeu_ps (float *__P, __m128 __A)
  {
-  __builtin_ia32_storeups (__P, (__v4sf)__A);
+  *(__m128_u *)__P = __A;
  }
  
  /* Store the lower SPFP value across four words.  */
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index c8b85fa83eaf66b5a6d6ebf606b18e22aff6f8be..61dc053a096d67b4dd12b3cf7c361c218471064d 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2016-08-31  Marc Glisse  <marc.glisse@inria.fr>
+
+       * gcc.target/i386/pr59539-2.c: Adapt options.
+       * gcc.target/i386/avx512f-vmovdqu32-1.c: Relax expected asm.
+
  2016-08-31  Paul Thomas  <pault@gcc.gnu.org>
  
         PR fortran/77418
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c

index 744bfbc46067475afc274f8b6ec2249a75a846fe..a6f1c290aeb07c52bcc864e881cf3e910f2788f3 100644 (file)
--- a/gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c
@@ -1,9 +1,9 @@
  /* { dg-do compile } */
  /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vmovdqu\[36\]\[24\]\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu(?:32|64)\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
  /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
  /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu(?:32|64)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */
  /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
  
  #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/pr59539-2.c b/gcc/testsuite/gcc.target/i386/pr59539-2.c

index b53b8c407abaeac3e4176dd6834e3ef28be79c60..eaa7057d1193f9b6012a77d07781340d88ac0b71 100644 (file)
--- a/gcc/testsuite/gcc.target/i386/pr59539-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr59539-2.c
@@ -1,6 +1,6 @@
  /* PR target/59539 */
  /* { dg-do compile } */
-/* { dg-options "-O2 -mavx2" } */
+/* { dg-options "-O2 -march=haswell" } */
  
  #include <immintrin.h>
author	Marc Glisse <marc.glisse@inria.fr>
	Wed, 31 Aug 2016 11:56:37 +0000 (13:56 +0200)
committer	Marc Glisse <glisse@gcc.gnu.org>
	Wed, 31 Aug 2016 11:56:37 +0000 (11:56 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/i386/avx512fintrin.h		patch \| blob \| history
gcc/config/i386/avxintrin.h		patch \| blob \| history
gcc/config/i386/emmintrin.h		patch \| blob \| history
gcc/config/i386/xmmintrin.h		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c		patch \| blob \| history
gcc/testsuite/gcc.target/i386/pr59539-2.c		patch \| blob \| history