i965/tiled_memcpy: Rework the RGBA -> BGRA mem_copy functions

author Jason Ekstrand <jason.ekstrand@intel.com>

Thu, 7 Apr 2016 18:21:19 +0000 (11:21 -0700)

committer Jason Ekstrand <jason.ekstrand@intel.com>

Fri, 8 Apr 2016 19:09:15 +0000 (12:09 -0700)
author Jason Ekstrand <jason.ekstrand@intel.com>
Thu, 7 Apr 2016 18:21:19 +0000 (11:21 -0700)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Fri, 8 Apr 2016 19:09:15 +0000 (12:09 -0700)
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c

index 0ffd3ffcab52e57f2d324a111b14c923ad66e6cd..66c1f9b82f6f8e449c158f547216019a420cf62e 100644 (file)
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -56,21 +56,32 @@ static const uint32_t ytile_width = 128;
  static const uint32_t ytile_height = 32;
  static const uint32_t ytile_span = 16;
  
+/**
+ * Copy RGBA to BGRA - swap R and B.
+ */
+static inline void *
+rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
+   assert(bytes % 4 == 0);
+
+   while (bytes >= 4) {
+      d[0] = s[2];
+      d[1] = s[1];
+      d[2] = s[0];
+      d[3] = s[3];
+      d += 4;
+      s += 4;
+      bytes -= 4;
+   }
+   return dst;
+}
+
  #ifdef __SSSE3__
  static const uint8_t rgba8_permutation[16] =
     { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
-
-/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
-
-/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
  #endif
  
  /**
@@ -82,32 +93,21 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
     uint8_t *d = dst;
     uint8_t const *s = src;
  
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
  
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_dst(d+16, s+16);
-      rgba8_copy_16_aligned_dst(d+32, s+32);
-      rgba8_copy_16_aligned_dst(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_store_si128((__m128i *)d,
+                      _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)s),
+                                       *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
     }
  #endif
  
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
+
     return dst;
  }
  
@@ -120,32 +120,21 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
     uint8_t *d = dst;
     uint8_t const *s = src;
  
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
  
-   if (bytes == 64) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_src(d+16, s+16);
-      rgba8_copy_16_aligned_src(d+32, s+32);
-      rgba8_copy_16_aligned_src(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_storeu_si128((__m128i *)d,
+                       _mm_shuffle_epi8(_mm_load_si128((__m128i *)s),
+                                        *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
     }
  #endif
  
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
+
     return dst;
  }
  
@@ -404,10 +393,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        if (mem_copy == memcpy)
           return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                   dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
           return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                   dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
        else
           unreachable("not reached");
     } else {
@@ -415,10 +404,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                   dst, src, src_pitch, swizzle_bit,
                                   memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
           return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                   dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
        else
           unreachable("not reached");
     }
@@ -447,20 +436,20 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        if (mem_copy == memcpy)
           return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                   dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
           return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                   dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
        else
           unreachable("not reached");
     } else {
        if (mem_copy == memcpy)
           return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                   dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
           return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                   dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
        else
           unreachable("not reached");
     }
@@ -489,20 +478,20 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        if (mem_copy == memcpy)
           return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                   dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
           return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                   dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
        else
           unreachable("not reached");
     } else {
        if (mem_copy == memcpy)
           return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                   dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
           return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                   dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
        else
           unreachable("not reached");
     }
@@ -531,20 +520,20 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        if (mem_copy == memcpy)
           return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                   dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
           return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                   dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
        else
           unreachable("not reached");
     } else {
        if (mem_copy == memcpy)
           return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                   dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
           return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                   dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
        else
           unreachable("not reached");
     }
@@ -775,8 +764,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
        if (format == GL_BGRA) {
           *mem_copy = memcpy;
        } else if (format == GL_RGBA) {
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
        }
     } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
                (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
@@ -787,8 +775,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
           /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
            * use the same function.
            */
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
        } else if (format == GL_RGBA) {
           *mem_copy = memcpy;
        }
author	Jason Ekstrand <jason.ekstrand@intel.com>
	Thu, 7 Apr 2016 18:21:19 +0000 (11:21 -0700)
committer	Jason Ekstrand <jason.ekstrand@intel.com>
	Fri, 8 Apr 2016 19:09:15 +0000 (12:09 -0700)