radv: set writes_memory for global memory stores/atomics

[mesa.git] / src / gallium / auxiliary / util / u_sse.h
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h

index 1df6c872677d6e5467bf2ab15f706fd5fc2b959a..cae4138ba01baf42e1a60e8ecc060d41fa14f07b 100644 (file)
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -1,6 +1,6 @@
  /**************************************************************************
   *
  /**************************************************************************
   *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008 VMware, Inc.
   * All Rights Reserved.
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
   * All Rights Reserved.
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
   * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -44,33 +44,6 @@
  #include <emmintrin.h>
  
  
  #include <emmintrin.h>
  
  
-/* MSVC before VC8 does not support the _mm_castxxx_yyy */
-#if defined(_MSC_VER) && _MSC_VER < 1500
-
-union __declspec(align(16)) m128_types {
-   __m128 m128;
-   __m128i m128i;
-   __m128d m128d;
-};
-
-static __inline __m128
-_mm_castsi128_ps(__m128i a)
-{
-   union m128_types u;
-   u.m128i = a;
-   return u.m128;
-}
-
-static __inline __m128i
-_mm_castps_si128(__m128 a)
-{
-   union m128_types u;
-   u.m128 = a;
-   return u.m128i;
-}
-
-#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
-
  union m128i {
     __m128i m;
     ubyte ub[16];
  union m128i {
     __m128i m;
     ubyte ub[16];
@@ -78,7 +51,7 @@ union m128i {
     uint ui[4];
  };
  
     uint ui[4];
  };
  
-static INLINE void u_print_epi8(const char *name, __m128i r)
+static inline void u_print_epi8(const char *name, __m128i r)
  {
     union { __m128i m; ubyte ub[16]; } u;
     u.m = r;
  {
     union { __m128i m; ubyte ub[16]; } u;
     u.m = r;
@@ -107,7 +80,7 @@ static INLINE void u_print_epi8(const char *name, __m128i r)
                  u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
  }
  
                  u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
  }
  
-static INLINE void u_print_epi16(const char *name, __m128i r)
+static inline void u_print_epi16(const char *name, __m128i r)
  {
     union { __m128i m; ushort us[8]; } u;
     u.m = r;
  {
     union { __m128i m; ushort us[8]; } u;
     u.m = r;
@@ -126,7 +99,7 @@ static INLINE void u_print_epi16(const char *name, __m128i r)
                  u.us[4],  u.us[5],  u.us[6],  u.us[7]);
  }
  
                  u.us[4],  u.us[5],  u.us[6],  u.us[7]);
  }
  
-static INLINE void u_print_epi32(const char *name, __m128i r)
+static inline void u_print_epi32(const char *name, __m128i r)
  {
     union { __m128i m; uint ui[4]; } u;
     u.m = r;
  {
     union { __m128i m; uint ui[4]; } u;
     u.m = r;
@@ -140,7 +113,7 @@ static INLINE void u_print_epi32(const char *name, __m128i r)
                  u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
  }
  
                  u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
  }
  
-static INLINE void u_print_ps(const char *name, __m128 r)
+static inline void u_print_ps(const char *name, __m128 r)
  {
     union { __m128 m; float f[4]; } u;
     u.m = r;
  {
     union { __m128 m; float f[4]; } u;
     u.m = r;
@@ -175,7 +148,12 @@ static INLINE void u_print_ps(const char *name, __m128 r)
   * MSVC will never get in here as its intrinsics support do not rely on
   * compiler command line options.
   */
   * MSVC will never get in here as its intrinsics support do not rely on
   * compiler command line options.
   */
-static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+static __inline __m128i
+#ifdef __clang__
+   __attribute__((__always_inline__, __nodebug__))
+#else
+   __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#endif
  _mm_shuffle_epi8(__m128i a, __m128i mask)
  {
      __m128i result;
  _mm_shuffle_epi8(__m128i a, __m128i mask)
  {
      __m128i result;
@@ -188,20 +166,55 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
  #endif /* !PIPE_ARCH_SSSE3 */
  
  
  #endif /* !PIPE_ARCH_SSSE3 */
  
  
+/*
+ * Provide an SSE2 implementation of _mm_mul_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * Basically, albeit surprising at first (and second, and third...) look,
+ * to get the signed product from the unsigned one, we can just
+ * subtract b from the high bits of the result if a is negative
+ * (and the same for a if b is negative). Modular arithmetic at its best!
+ *
+ * So for int32 a,b in crude pseudo-code ("*" here denoting a widening mul)
+ * fixupb = (signmask(b) & a) << 32ULL
+ * fixupa = (signmask(a) & b) << 32ULL
+ * a * b = (unsigned)a * (unsigned)b - fixupb - fixupa
+ * = (unsigned)a * (unsigned)b -(fixupb + fixupa)
+ *
+ * This computes the products of dwords 0/2 (returned) and of dwords 1/3
+ * (stored to *res13) at the same time, since the fixup computation is
+ * shared between the two.
+ */
+static inline __m128i
+mm_mullohi_epi32(const __m128i a, const __m128i b, __m128i *res13)
+{
+   __m128i a13, b13, mul02, mul13;
+   __m128i anegmask, bnegmask, fixup, fixup02, fixup13;
+   a13 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2,3,0,1));
+   b13 = _mm_shuffle_epi32(b, _MM_SHUFFLE(2,3,0,1));
+   anegmask = _mm_srai_epi32(a, 31);
+   bnegmask = _mm_srai_epi32(b, 31);
+   fixup = _mm_add_epi32(_mm_and_si128(anegmask, b),
+                         _mm_and_si128(bnegmask, a));
+   mul02 = _mm_mul_epu32(a, b);
+   mul13 = _mm_mul_epu32(a13, b13);
+   fixup02 = _mm_slli_epi64(fixup, 32);
+   fixup13 = _mm_and_si128(fixup, _mm_set_epi32(-1,0,-1,0));
+   *res13 = _mm_sub_epi64(mul13, fixup13);
+   return _mm_sub_epi64(mul02, fixup02);
+}
  
  
  /* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
   * _mm_mul_epu32().
   *
  
  
  /* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
   * _mm_mul_epu32().
   *
- * I suspect this works fine for us because one of our operands is
- * always positive, but not sure that this can be used for general
- * signed integer multiplication.
+ * This always works regardless of the signs of the operands, since
+ * the high bits (which would be different) aren't used.
   *
   * This seems close enough to the speed of SSE4 and the real
   * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
   * dependency at this point.
   */
   *
   * This seems close enough to the speed of SSE4 and the real
   * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
   * dependency at this point.
   */
-static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+static inline __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
  {
     __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
     __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
  {
     __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
     __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
@@ -210,6 +223,12 @@ static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
  
     /* Interleave the results, either with shuffles or (slightly
      * faster) direct bit operations:
  
     /* Interleave the results, either with shuffles or (slightly
      * faster) direct bit operations:
+    * XXX: this might only be true for some cpus (in particular 65nm
+    * Core 2). On most cpus (including that Core 2, but not Nehalem...)
+    * using _mm_shuffle_ps/_mm_shuffle_epi32 might also be faster
+    * than using the 3 instructions below. But the bit operations should
+    * be fine as well; we can't have an optimal solution for all cpus (if
+    * anything, we should just use _mm_mullo_epi32() if sse41 is available...).
      */
  #if 0
     __m128i ba8             = _mm_shuffle_epi32(ba, 8);
      */
  #if 0
     __m128i ba8             = _mm_shuffle_epi32(ba, 8);
@@ -226,7 +245,7 @@ static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
  }
  
  
  }
  
  
-static INLINE void
+static inline void
  transpose4_epi32(const __m128i * restrict a,
                   const __m128i * restrict b,
                   const __m128i * restrict c,
  transpose4_epi32(const __m128i * restrict a,
                   const __m128i * restrict b,
                   const __m128i * restrict c,
@@ -236,17 +255,44 @@ transpose4_epi32(const __m128i * restrict a,
                   __m128i * restrict q,
                   __m128i * restrict r)
  {
                   __m128i * restrict q,
                   __m128i * restrict r)
  {
-  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
-  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
-  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
-  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
-
-  *o = _mm_unpacklo_epi64(t0, t1);
-  *p = _mm_unpackhi_epi64(t0, t1);
-  *q = _mm_unpacklo_epi64(t2, t3);
-  *r = _mm_unpackhi_epi64(t2, t3);
+   __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
  }
  
  }
  
+
+/*
+ * Same as transpose4_epi32(), except the first two values are already
+ * interleaved (i.e. a01 and a23 contain 64bit values).
+ */
+static inline void
+transpose2_64_2_32(const __m128i * restrict a01,
+                   const __m128i * restrict a23,
+                   const __m128i * restrict c,
+                   const __m128i * restrict d,
+                   __m128i * restrict o,
+                   __m128i * restrict p,
+                   __m128i * restrict q,
+                   __m128i * restrict r)
+{
+   __m128i t0 = *a01;
+   __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+   __m128i t2 = *a23;
+   __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+   *o = _mm_unpacklo_epi64(t0, t1);
+   *p = _mm_unpackhi_epi64(t0, t1);
+   *q = _mm_unpacklo_epi64(t2, t3);
+   *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+
  #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
  
  
  #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))