Import Thomas Hellstrom's SSE memcpy code from the via X.org driver.
author Keith Whitwell <keith@tungstengraphics.com>
Mon, 23 May 2005 12:17:27 +0000 (12:17 +0000)
committer Keith Whitwell <keith@tungstengraphics.com>
Mon, 23 May 2005 12:17:27 +0000 (12:17 +0000)
Add a TextureMemCpy callback, called from texstore.c when copying
texture data via the memcpy_texture() path.
Enable this code in the via driver - 100% speedup in texdown.c results.

src/mesa/drivers/common/driverfuncs.c
src/mesa/drivers/dri/unichrome/Makefile
src/mesa/drivers/dri/unichrome/via_memcpy.c [new file with mode: 0644]
src/mesa/drivers/dri/unichrome/via_tex.c
src/mesa/drivers/dri/unichrome/via_tex.h
src/mesa/main/dd.h
src/mesa/main/texstore.c

index dcaaa9199f5811b5720742c02234ce0bce770a2c..4a8ea31a48f885b555485eea1286d880cadbeac1 100644 (file)
@@ -107,6 +107,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->DeleteTexture = _mesa_delete_texture_object;
    driver->NewTextureImage = _mesa_new_texture_image;
    driver->FreeTexImageData = _mesa_free_texture_image_data; 
+   driver->TextureMemCpy = _mesa_memcpy; 
    driver->IsTextureResident = NULL;
    driver->PrioritizeTexture = NULL;
    driver->ActiveTexture = NULL;
index 7875ba8f726f6608afeb7910195f24f3452af36e..5fe00c1bd1fa479c4e6c218905bafd4417d6bc09 100644 (file)
@@ -12,6 +12,7 @@ DRIVER_SOURCES = \
        via_fb.c \
        via_tex.c \
        via_ioctl.c \
+       via_memcpy.c \
        via_render.c \
        via_screen.c \
        via_span.c \
diff --git a/src/mesa/drivers/dri/unichrome/via_memcpy.c b/src/mesa/drivers/dri/unichrome/via_memcpy.c
new file mode 100644 (file)
index 0000000..351ba26
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2004 Thomas Hellstrom, All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sub license,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Thomas' original gutted for mesa by Keith Whitwell
+ */
+
+#include "via_tex.h"
+
+/*
+ * Building blocks for the inline-asm copy routines in this file.
+ *
+ * SSE_PREFETCH -- text of the prefetch instruction (prefetchnta); it is
+ *                 concatenated with an address operand inside the asm
+ *                 templates.
+ * FENCE        -- emits "sfence" with a "memory" clobber.  The expansion
+ *                 already ends in ';', so writing "FENCE;" yields an extra
+ *                 empty statement.
+ */
+#define SSE_PREFETCH "  prefetchnta "
+#define FENCE __asm__ __volatile__ ("sfence":::"memory");
+
+/*
+ * Issue a burst of prefetches for the beginning of a source buffer.
+ *
+ * arch_prefetch -- string literal holding the prefetch mnemonic (e.g.
+ *                  SSE_PREFETCH); it is pasted in front of every address.
+ * from          -- source pointer, handed to the asm in a general register.
+ *
+ * Offsets 0 through 288 are touched in 32-byte steps.
+ * NOTE(review): offset 224 is absent from the list -- looks like an
+ * omission; confirm whether it is intentional.
+ * NOTE(review): the local labels "1:" and "2:" are never branched to from
+ * within this template.
+ */
+#define PREFETCH1(arch_prefetch,from)                  \
+    __asm__ __volatile__ (                             \
+                         "1:  " arch_prefetch "(%0)\n" \
+                         arch_prefetch "32(%0)\n"      \
+                         arch_prefetch "64(%0)\n"      \
+                         arch_prefetch "96(%0)\n"      \
+                         arch_prefetch "128(%0)\n"     \
+                         arch_prefetch "160(%0)\n"     \
+                         arch_prefetch "192(%0)\n"     \
+                         arch_prefetch "256(%0)\n"     \
+                         arch_prefetch "288(%0)\n"     \
+                         "2:\n"                        \
+                         : : "r" (from) );
+
+
+/*
+ * Copy n bytes from 'from' to 'to' with x86 string instructions.
+ *
+ * The count is split as: n >> 2 dwords ("rep ; movsl"), then one word if
+ * bit 1 of n is set ("movsw"), then one byte if bit 0 is set ("movsb").
+ *
+ * to, from -- must be lvalues: they are bound to EDI/ESI as in/out
+ *             operands, so after the macro they hold the advanced
+ *             pointers.
+ * n        -- byte count; constrained to a byte-addressable register ("q")
+ *             because its low byte is tested via %b2.  It is shifted with
+ *             an arithmetic shift (sarl), so a non-negative value is
+ *             expected.
+ *
+ * ECX is used as the repeat counter and is listed as clobbered.
+ * The expansion is a bare { } block rather than do { } while (0), so it
+ * cannot be followed by "else" when used as the body of an "if".
+ * NOTE(review): uses 32-bit register names (movl/%ecx) and casts the
+ * pointers to long -- presumably only built for 32-bit x86; confirm.
+ */
+#define small_memcpy(to,from,n)                                                \
+    {                                                                  \
+       __asm__ __volatile__(                                           \
+                            "movl %2,%%ecx\n\t"                        \
+                             "sarl $2,%%ecx\n\t"                       \
+                            "rep ; movsl\n\t"                          \
+                            "testb $2,%b2\n\t"                         \
+                            "je 1f\n\t"                                \
+                            "movsw\n"                                  \
+                            "1:\ttestb $1,%b2\n\t"                     \
+                            "je 2f\n\t"                                \
+                            "movsb\n"                                  \
+                            "2:"                                       \
+                            :"=&D" (to), "=&S" (from)                  \
+                            :"q" (n),"0" ((long) to),"1" ((long) from) \
+                            : "%ecx","memory");                        \
+    }
+
+/*
+ * Copy lcnt chunks of 64 bytes from 'from' to 'to' using four XMM
+ * registers per iteration and non-temporal stores (movntps).
+ *
+ * prefetch -- string pasted in front of "320(%1)" and "352(%1)"; pass
+ *             SSE_PREFETCH to prefetch ahead of the loads, or "#" to turn
+ *             those two lines into assembler comments.
+ * from, to -- must be lvalues: bound to ESI/EDI as in/out operands and
+ *             advanced by 64 per iteration, so they point past the copied
+ *             data afterwards.
+ * dummy    -- scratch int lvalue; it receives lcnt and is counted down.
+ * lcnt     -- number of 64-byte chunks.  The loop tests the count only
+ *             after decrementing, so lcnt must be at least 1.
+ *
+ * If 'from' is not 16-byte aligned the loads use movups, otherwise movaps.
+ * The stores are always movntps, which requires a 16-byte aligned
+ * address; this macro does not check the alignment of 'to'.
+ * NOTE(review): xmm0-xmm3 are overwritten but do not appear in the
+ * clobber list -- confirm this is safe for the callers' build flags.
+ * NOTE(review): addl/decl operate on 32-bit registers -- presumably only
+ * built for 32-bit x86; confirm.
+ */
+#define SSE_CPY(prefetch,from,to,dummy,lcnt)                           \
+    if ((unsigned long) from & 15)                      {              \
+       __asm__ __volatile__ (                                          \
+                             "1:\n"                                    \
+                              prefetch "320(%1)\n"                     \
+                             "  movups (%1), %%xmm0\n"                 \
+                             "  movups 16(%1), %%xmm1\n"               \
+                             "  movntps %%xmm0, (%0)\n"                \
+                             "  movntps %%xmm1, 16(%0)\n"              \
+                              prefetch "352(%1)\n"                     \
+                             "  movups 32(%1), %%xmm2\n"               \
+                             "  movups 48(%1), %%xmm3\n"               \
+                             "  movntps %%xmm2, 32(%0)\n"              \
+                             "  movntps %%xmm3, 48(%0)\n"              \
+                             "  addl $64,%0\n"                         \
+                             "  addl $64,%1\n"                         \
+                             "  decl %2\n"                             \
+                             "  jne 1b\n"                              \
+                             :"=&D"(to), "=&S"(from), "=&r"(dummy)     \
+                             :"0" (to), "1" (from), "2" (lcnt): "memory"); \
+    } else {                                                           \
+       __asm__ __volatile__ (                                          \
+                             "2:\n"                                    \
+                             prefetch "320(%1)\n"                      \
+                             "  movaps (%1), %%xmm0\n"                 \
+                             "  movaps 16(%1), %%xmm1\n"               \
+                             "  movntps %%xmm0, (%0)\n"                \
+                             "  movntps %%xmm1, 16(%0)\n"              \
+                              prefetch "352(%1)\n"                     \
+                             "  movaps 32(%1), %%xmm2\n"               \
+                             "  movaps 48(%1), %%xmm3\n"               \
+                             "  movntps %%xmm2, 32(%0)\n"              \
+                             "  movntps %%xmm3, 48(%0)\n"              \
+                             "  addl $64,%0\n"                         \
+                             "  addl $64,%1\n"                         \
+                             "  decl %2\n"                             \
+                             "  jne 2b\n"                              \
+                             :"=&D"(to), "=&S"(from), "=&r"(dummy)     \
+                             :"0" (to), "1" (from), "2" (lcnt): "memory"); \
+    }
+
+
+
+/*
+ * Copy sz bytes from 'from' to 'to' using SSE loads and non-temporal
+ * (movntps) stores.
+ *
+ * to   -- destination; any alignment is accepted (see step 2).
+ * from -- source; may be unaligned, SSE_CPY selects movups or movaps.
+ * sz   -- number of bytes to copy.
+ *
+ * Steps:
+ *   1. Prefetch the start of the source.
+ *   2. Copy up to 15 leading bytes with small_memcpy() so that 'to'
+ *      reaches a 16-byte boundary; movntps faults on a misaligned
+ *      destination.
+ *   3. Copy whole 64-byte chunks with SSE_CPY.  The final (up to) five
+ *      chunks are copied with the prefetch text replaced by "#", which
+ *      turns those template lines into assembler comments, so the loop
+ *      stops prefetching 320+ bytes ahead near the end of the data.
+ *   4. Copy the remaining 0..63 bytes with small_memcpy().
+ *   5. Issue sfence after the non-temporal stores.
+ *
+ * The macros advance 'to' and 'from' in place, which is what lets each
+ * step continue where the previous one stopped.
+ */
+void via_sse_memcpy(void *to,
+                   const void *from,
+                   size_t sz)
+
+{
+   int dummy;
+   int lcnt;
+   int rest;
+   /* Bytes needed to bring the destination up to a 16-byte boundary. */
+   int head = (int) ((16 - ((unsigned long) to & 15)) & 15);
+
+   PREFETCH1(SSE_PREFETCH,from);
+
+   if (head) {
+      /* Never copy more than was asked for. */
+      if ((size_t) head > sz)
+         head = (int) sz;
+      small_memcpy(to, from, head);
+      sz -= head;
+   }
+
+   lcnt = sz >> 6;
+   rest = sz & 63;
+
+   if (lcnt > 5) {
+      lcnt -= 5;
+      SSE_CPY(SSE_PREFETCH,from,to,dummy,lcnt);
+      lcnt = 5;
+   }
+   if (lcnt) {
+      SSE_CPY("#",from,to,dummy,lcnt);
+   }
+   if (rest) small_memcpy(to, from, rest);
+   FENCE;
+}
+
+
+
index 94cdf61accd55c8296b44f245707e2125f33147b..9672888f8eae05346699c8d1f6a91484a66a65d8 100644 (file)
@@ -947,7 +947,12 @@ void viaInitTextureFuncs(struct dd_function_table * functions)
    functions->NewTextureImage = viaNewTextureImage;
    functions->DeleteTexture = _mesa_delete_texture_object;
    functions->FreeTexImageData = viaFreeTextureImageData;
-                                           
+
+   if (getenv("VIA_NO_SSE"))
+      functions->TextureMemCpy = _mesa_memcpy;
+   else
+      functions->TextureMemCpy = via_sse_memcpy;
+
    functions->UpdateTexturePalette = 0;
    functions->IsTextureResident = viaIsTextureResident;
 }
index f6c024e4381de6d10258a38f5897b0b878d1314d..8277aada4073f2cc57a354d0a79d3992a03083e7 100644 (file)
@@ -34,4 +34,6 @@ GLboolean viaUpdateTextureState(GLcontext *ctx);
 void viaInitTextureFuncs(struct dd_function_table * functions);
 GLboolean viaSwapOutWork( struct via_context *vmesa );
 
+void via_sse_memcpy( void *to, const void *from, size_t sz );
+
 #endif
index 3746e4685c7599202efc3f6e8cd0eee31563ee64..e1f3ad545295a14c6ce7e6123b1fa4c72985b171 100644 (file)
@@ -514,6 +514,19 @@ struct dd_function_table {
     */
    void (*FreeTexImageData)( GLcontext *ctx, struct gl_texture_image *tImage );
 
+   /**
+    * Note: no context argument.  This function doesn't initially look
+    * like it belongs here, except that the driver is the only entity
+    * that knows for sure how the texture memory is allocated - via
+    * the above callbacks.  There is then an argument that the driver
+    * knows what memcpy paths might be fast.  Typically this is invoked with
+    * 
+    * to -- a pointer into texture memory allocated by NewTextureImage() above.
+    * from -- a pointer into client memory or a mesa temporary.
+    * sz -- nr bytes to copy.
+    */
+   void (*TextureMemCpy)( void *to, const void *from, size_t sz );
+
    /**
     * Called by glAreTextureResident().
     */
index d46b23c803a4d3d8efd032acfea4be534e1f9a13..9a910b21fc7b30d0d18123a143330af68e55544e 100644 (file)
@@ -671,7 +671,8 @@ _mesa_swizzle_ubyte_image(GLcontext *ctx,
  * 1D, 2D and 3D images supported.
  */
 static void
-memcpy_texture(GLuint dimensions,
+memcpy_texture(GLcontext *ctx,
+              GLuint dimensions,
                const struct gl_texture_format *dstFormat,
                GLvoid *dstAddr,
                GLint dstXoffset, GLint dstYoffset, GLint dstZoffset,
@@ -701,7 +702,7 @@ memcpy_texture(GLuint dimensions,
          dstImageStride == bytesPerImage) ||
         (srcDepth == 1))) {
       /* one big memcpy */
-      _mesa_memcpy(dstImage, srcImage, bytesPerTexture);
+      ctx->Driver.TextureMemCpy(dstImage, srcImage, bytesPerTexture);
    }
    else {
       GLint img, row;
@@ -709,7 +710,7 @@ memcpy_texture(GLuint dimensions,
          const GLubyte *srcRow = srcImage;
          GLubyte *dstRow = dstImage;
          for (row = 0; row < srcHeight; row++) {
-            _mesa_memcpy(dstRow, srcRow, bytesPerRow);
+            ctx->Driver.TextureMemCpy(dstRow, srcRow, bytesPerRow);
             dstRow += dstRowStride;
             srcRow += srcRowStride;
          }
@@ -776,7 +777,7 @@ _mesa_texstore_rgba(GLcontext *ctx, GLuint dims,
        baseInternalFormat == srcFormat &&
        srcType == CHAN_TYPE) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -862,7 +863,7 @@ _mesa_texstore_depth_component_float32(STORE_PARAMS)
        srcFormat == GL_DEPTH_COMPONENT &&
        srcType == GL_FLOAT) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -907,7 +908,7 @@ _mesa_texstore_depth_component16(STORE_PARAMS)
        srcFormat == GL_DEPTH_COMPONENT &&
        srcType == GL_UNSIGNED_SHORT) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -958,7 +959,7 @@ _mesa_texstore_rgb565(STORE_PARAMS)
        srcFormat == GL_RGB &&
        srcType == GL_UNSIGNED_SHORT_5_6_5) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1067,7 +1068,7 @@ _mesa_texstore_rgba8888(STORE_PARAMS)
       ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
        (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV))) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1173,7 +1174,7 @@ _mesa_texstore_argb8888(STORE_PARAMS)
        ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
         srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
       /* simple memcpy path (little endian) */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1187,7 +1188,7 @@ _mesa_texstore_argb8888(STORE_PARAMS)
        ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
         srcType == GL_UNSIGNED_INT_8_8_8_8)) {
       /* simple memcpy path (big endian) */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1348,7 +1349,7 @@ _mesa_texstore_rgb888(STORE_PARAMS)
        srcType == GL_UNSIGNED_BYTE &&
        littleEndian) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1453,7 +1454,7 @@ _mesa_texstore_bgr888(STORE_PARAMS)
        srcType == GL_UNSIGNED_BYTE &&
        littleEndian) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1537,7 +1538,7 @@ _mesa_texstore_argb4444(STORE_PARAMS)
        srcFormat == GL_BGRA &&
        srcType == GL_UNSIGNED_SHORT_4_4_4_4_REV) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1607,7 +1608,7 @@ _mesa_texstore_argb1555(STORE_PARAMS)
        srcFormat == GL_BGRA &&
        srcType == GL_UNSIGNED_SHORT_1_5_5_5_REV) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1680,7 +1681,7 @@ _mesa_texstore_al88(STORE_PARAMS)
        srcType == GL_UNSIGNED_BYTE &&
        littleEndian) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1744,7 +1745,7 @@ _mesa_texstore_rgb332(STORE_PARAMS)
        baseInternalFormat == GL_RGB &&
        srcFormat == GL_RGB && srcType == GL_UNSIGNED_BYTE_3_3_2) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1802,7 +1803,7 @@ _mesa_texstore_a8(STORE_PARAMS)
        baseInternalFormat == srcFormat &&
        srcType == GL_UNSIGNED_BYTE) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1856,7 +1857,7 @@ _mesa_texstore_ci8(STORE_PARAMS)
        srcFormat == GL_COLOR_INDEX &&
        srcType == GL_UNSIGNED_BYTE) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1906,7 +1907,7 @@ _mesa_texstore_ycbcr(STORE_PARAMS)
    ASSERT(baseInternalFormat == GL_YCBCR_MESA);
 
    /* always just memcpy since no pixel transfer ops apply */
-   memcpy_texture(dims,
+   memcpy_texture(ctx, dims,
                   dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                   dstRowStride, dstImageStride,
                   srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -1971,7 +1972,7 @@ _mesa_texstore_rgba_float32(STORE_PARAMS)
        baseInternalFormat == srcFormat &&
        srcType == GL_FLOAT) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,
@@ -2039,7 +2040,7 @@ _mesa_texstore_rgba_float16(STORE_PARAMS)
        baseInternalFormat == srcFormat &&
        srcType == GL_HALF_FLOAT_ARB) {
       /* simple memcpy path */
-      memcpy_texture(dims,
+      memcpy_texture(ctx, dims,
                      dstFormat, dstAddr, dstXoffset, dstYoffset, dstZoffset,
                      dstRowStride, dstImageStride,
                      srcWidth, srcHeight, srcDepth, srcFormat, srcType,