Add support for optimized versions of the code underlying ReadPixels

author Ian Romanick <idr@us.ibm.com>

Thu, 14 Oct 2004 00:59:12 +0000 (00:59 +0000)

committer Ian Romanick <idr@us.ibm.com>

Thu, 14 Oct 2004 00:59:12 +0000 (00:59 +0000)
author Ian Romanick <idr@us.ibm.com>
Thu, 14 Oct 2004 00:59:12 +0000 (00:59 +0000)
committer Ian Romanick <idr@us.ibm.com>
Thu, 14 Oct 2004 00:59:12 +0000 (00:59 +0000)
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h

new file mode 100644 (file)

index 0000000..ce0a66d
--- /dev/null
+++ b/src/mesa/drivers/dri/common/spantmp2.h
@@ -0,0 +1,564 @@
+/*
+ * Copyright 2000-2001 VA Linux Systems, Inc.
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * VA LINUX SYSTEM, IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file spantmp2.h
+ *
+ * Template file of span read / write functions.
+ *
+ * \author Keith Whitwell <keithw@tungstengraphics.com>
+ * \author Gareth Hughes <gareth@nvidia.com>
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "colormac.h"
+
+/* Debug tracing switch: the "if (DBG) fprintf(...)" statements in this
+ * template are active only when the including file defines DBG non-zero.
+ */
+#ifndef DBG
+#define DBG 0
+#endif
+
+/* Read- and write-specific lock / clip-loop hooks.  An including file may
+ * define any of them before including this template; each one that is left
+ * undefined falls back to the generic HW_LOCK / HW_UNLOCK / HW_CLIPLOOP
+ * macro, which the including file is then expected to provide.
+ */
+#ifndef HW_WRITE_LOCK
+#define HW_WRITE_LOCK()                HW_LOCK()
+#endif
+
+#ifndef HW_WRITE_UNLOCK
+#define HW_WRITE_UNLOCK()      HW_UNLOCK()
+#endif
+
+#ifndef HW_READ_LOCK
+#define HW_READ_LOCK()         HW_LOCK()
+#endif
+
+#ifndef HW_READ_UNLOCK
+#define HW_READ_UNLOCK()       HW_UNLOCK()
+#endif
+
+#ifndef HW_READ_CLIPLOOP
+#define HW_READ_CLIPLOOP()     HW_CLIPLOOP()
+#endif
+
+#ifndef HW_WRITE_CLIPLOOP
+#define HW_WRITE_CLIPLOOP()    HW_CLIPLOOP()
+#endif
+
+/* Per-format pixel access macros.
+ *
+ * The including file selects the layout with SPANTMP_PIXEL_FMT and
+ * SPANTMP_PIXEL_TYPE and supplies GET_SRC_PTR / GET_DST_PTR, which map an
+ * (x, y) pair to the address of that pixel.  This section then defines:
+ *
+ *   INIT_MONO_PIXEL(p, color)     - pack color[] into one pixel value p
+ *   WRITE_RGBA(x, y, r, g, b, a)  - pack and store one pixel
+ *   WRITE_PIXEL(x, y, p)          - store an already packed pixel
+ *   READ_RGBA(rgba, x, y)         - load one pixel and unpack it to 4 bytes
+ */
+#if (SPANTMP_PIXEL_FMT == GL_RGB)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)
+
+#define INIT_MONO_PIXEL(p, color) \
+  p = PACK_COLOR_565( color[0], color[1], color[2] )
+
+/* Alpha is dropped.  The pixel is accessed as GLushort (as WRITE_PIXEL does)
+ * so that packed values with the top red bit set are never converted to a
+ * signed 16-bit type.
+ */
+#define WRITE_RGBA( _x, _y, r, g, b, a )                               \
+   do {                                                                 \
+      GLushort * _p = (GLushort *) GET_DST_PTR(_x, _y);                 \
+      _p[0] = ((((int)(r) & 0xf8) << 8) | (((int)(g) & 0xfc) << 3) |    \
+               (((int)(b) & 0xf8) >> 3));                               \
+   } while(0)
+
+#define WRITE_PIXEL( _x, _y, p )                                       \
+   do {                                                                 \
+      GLushort * _p = (GLushort *) GET_DST_PTR(_x, _y);                 \
+      _p[0] = p;                                                        \
+   } while(0)
+
+/* Each field is rescaled so that an all-ones field yields 255
+ * (0xf8 -> 255 for red/blue, 0xfc -> 255 for green); alpha is forced to 0xff.
+ */
+#define READ_RGBA( rgba, _x, _y )                                      \
+   do {                                                                 \
+      GLushort p = *(volatile GLushort *) GET_SRC_PTR(_x, _y);          \
+      rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8;                         \
+      rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc;                         \
+      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;                         \
+      rgba[3] = 0xff;                                                   \
+   } while (0)
+
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
+
+# define INIT_MONO_PIXEL(p, color)                       \
+     p = PACK_COLOR_8888(color[3], color[0], color[1], color[2])
+
+/* Components are widened to GLuint before shifting: a GLubyte promotes to
+ * (signed) int, and shifting a value >= 0x80 left by 24 would overflow it.
+ */
+# define WRITE_RGBA(_x, _y, r, g, b, a)                                 \
+    do {                                                                \
+       GLuint * _p = (GLuint *) GET_DST_PTR(_x, _y);                    \
+       _p[0] = (((GLuint)(r) << 16) | ((GLuint)(g) <<  8) |             \
+                ((GLuint)(b) <<  0) | ((GLuint)(a) << 24));             \
+    } while(0)
+
+# define WRITE_PIXEL(_x, _y, p)                                         \
+    do {                                                                \
+       GLuint * _p = (GLuint *) GET_DST_PTR(_x, _y);                    \
+       _p[0] = p;                                                       \
+    } while(0)
+
+# if defined( USE_X86_ASM )
+/* 0xAARRGGBB --bswap--> 0xBBGGRRAA --ror 8--> 0xAABBGGRR, which a
+ * little-endian store lays out in memory as the bytes R, G, B, A.
+ *
+ * The input operand uses the matching constraint "0" so that it is loaded
+ * into the same register as output %0.  The instructions only name %0; with
+ * independent "=r" / "r" operands the compiler would be free to put the
+ * input in a different register and the swap would run on garbage.
+ */
+#  define READ_RGBA(rgba, _x, _y)                                       \
+    do {                                                                \
+       GLuint p = *(volatile GLuint *) GET_SRC_PTR(_x, _y);             \
+       __asm__ __volatile__( "bswap    %0; rorl $8, %0"                 \
+                             : "=r" (p) : "0" (p) );                    \
+       ((GLuint *)rgba)[0] = p;                                         \
+    } while (0)
+# else
+#  define READ_RGBA( rgba, _x, _y )                                    \
+     do {                                                               \
+        GLuint p = *(volatile GLuint *) GET_SRC_PTR(_x, _y);            \
+        rgba[0] = (p >> 16) & 0xff;                                     \
+        rgba[1] = (p >>  8) & 0xff;                                     \
+        rgba[2] = (p >>  0) & 0xff;                                     \
+        rgba[3] = (p >> 24) & 0xff;                                     \
+     } while (0)
+# endif
+
+#else
+#error SPANTMP_PIXEL_FMT must be set to a valid value!
+#endif
+
+#if defined( USE_MMX_ASM ) || defined( USE_SSE_ASM )
+#include "x86/read_rgba_span_x86.h"
+#include "x86/common_x86_asm.h"
+#endif
+
+/**
+ * Write a horizontal span of RGBA pixels.
+ *
+ * Runs between \c HW_WRITE_LOCK and \c HW_WRITE_UNLOCK.  \p y is passed
+ * through \c Y_FLIP once; then, for every iteration of the
+ * \c HW_WRITE_CLIPLOOP, the span is clipped with \c CLIPSPAN (which yields
+ * the start column \c x1, the pixel count \c n1 and the start index \c i
+ * into \p rgba) and the remaining pixels are stored with \c WRITE_RGBA.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ *              NOTE(review): presumably consumed by the driver-supplied
+ *              \c LOCAL_VARS / lock macros - confirm per driver.
+ * \param n     Number of pixels in the span.
+ * \param x     X coordinate of the first pixel.
+ * \param y     Y coordinate of the span.
+ * \param rgba  Source colors, one R,G,B,A quadruple per pixel.
+ * \param mask  If non-NULL, only pixels whose mask entry is non-zero are
+ *              written; if NULL every pixel left after clipping is written.
+ */
+static void TAG(WriteRGBASpan)( const GLcontext *ctx,
+                               GLuint n, GLint x, GLint y,
+                               const GLubyte rgba[][4],
+                               const GLubyte mask[] )
+{
+   HW_WRITE_LOCK()
+      {
+        GLint x1;
+        GLint n1;
+        LOCAL_VARS;
+
+        y = Y_FLIP(y);
+
+        HW_WRITE_CLIPLOOP()
+           {
+              GLint i = 0;
+              CLIPSPAN(x,y,n,x1,n1,i);
+
+              if (DBG) fprintf(stderr, "WriteRGBASpan %d..%d (x1 %d)\n",
+                               (int)i, (int)n1, (int)x1);
+
+              if (mask)
+              {
+                 for (;n1>0;i++,x1++,n1--)
+                    if (mask[i])
+                       WRITE_RGBA( x1, y,
+                                   rgba[i][0], rgba[i][1],
+                                   rgba[i][2], rgba[i][3] );
+              }
+              else
+              {
+                 for (;n1>0;i++,x1++,n1--)
+                    WRITE_RGBA( x1, y,
+                                rgba[i][0], rgba[i][1],
+                                rgba[i][2], rgba[i][3] );
+              }
+           }
+        HW_ENDCLIPLOOP();
+      }
+   HW_WRITE_UNLOCK();
+}
+
+/**
+ * Write a horizontal span of RGB pixels.
+ *
+ * Same structure as the RGBA span writer, but the source has three
+ * components per pixel and a constant 255 is passed to \c WRITE_RGBA as the
+ * alpha value.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels in the span.
+ * \param x     X coordinate of the first pixel.
+ * \param y     Y coordinate of the span; passed through \c Y_FLIP.
+ * \param rgb   Source colors, one R,G,B triple per pixel.
+ * \param mask  If non-NULL, only pixels whose mask entry is non-zero are
+ *              written; if NULL every pixel left after clipping is written.
+ */
+static void TAG(WriteRGBSpan)( const GLcontext *ctx,
+                              GLuint n, GLint x, GLint y,
+                              const GLubyte rgb[][3],
+                              const GLubyte mask[] )
+{
+   HW_WRITE_LOCK()
+      {
+        GLint x1;
+        GLint n1;
+        LOCAL_VARS;
+
+        y = Y_FLIP(y);
+
+        HW_WRITE_CLIPLOOP()
+           {
+              GLint i = 0;
+              CLIPSPAN(x,y,n,x1,n1,i);
+
+              if (DBG) fprintf(stderr, "WriteRGBSpan %d..%d (x1 %d)\n",
+                               (int)i, (int)n1, (int)x1);
+
+              if (mask)
+              {
+                 for (;n1>0;i++,x1++,n1--)
+                    if (mask[i])
+                       WRITE_RGBA( x1, y, rgb[i][0], rgb[i][1], rgb[i][2], 255 );
+              }
+              else
+              {
+                 for (;n1>0;i++,x1++,n1--)
+                    WRITE_RGBA( x1, y, rgb[i][0], rgb[i][1], rgb[i][2], 255 );
+              }
+           }
+        HW_ENDCLIPLOOP();
+      }
+   HW_WRITE_UNLOCK();
+}
+
+/**
+ * Write an array of individually positioned RGBA pixels.
+ *
+ * For every iteration of the \c HW_WRITE_CLIPLOOP all \p n pixels are
+ * visited; a pixel is stored with \c WRITE_RGBA only if \c CLIPPIXEL accepts
+ * its x and \c Y_FLIP'ed y coordinate.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels.
+ * \param x     X coordinate of each pixel.
+ * \param y     Y coordinate of each pixel; each is passed through \c Y_FLIP.
+ * \param rgba  Source colors, one R,G,B,A quadruple per pixel.
+ * \param mask  If non-NULL, pixels whose mask entry is zero are skipped;
+ *              if NULL all pixels are candidates.
+ */
+static void TAG(WriteRGBAPixels)( const GLcontext *ctx,
+                              GLuint n,
+                              const GLint x[],
+                              const GLint y[],
+                              const GLubyte rgba[][4],
+                              const GLubyte mask[] )
+{
+   HW_WRITE_LOCK()
+      {
+        GLint i;
+        LOCAL_VARS;
+
+        if (DBG) fprintf(stderr, "WriteRGBAPixels\n");
+
+        HW_WRITE_CLIPLOOP()
+           {
+              if (mask)
+              {
+                 for (i=0;i<n;i++)
+                 {
+                    if (mask[i]) {
+                       const int fy = Y_FLIP(y[i]);
+                       if (CLIPPIXEL(x[i],fy))
+                          WRITE_RGBA( x[i], fy,
+                                      rgba[i][0], rgba[i][1],
+                                      rgba[i][2], rgba[i][3] );
+                    }
+                 }
+              }
+              else
+              {
+                 for (i=0;i<n;i++)
+                 {
+                    const int fy = Y_FLIP(y[i]);
+                    if (CLIPPIXEL(x[i],fy))
+                       WRITE_RGBA( x[i], fy,
+                                   rgba[i][0], rgba[i][1],
+                                   rgba[i][2], rgba[i][3] );
+                 }
+              }
+           }
+        HW_ENDCLIPLOOP();
+      }
+   HW_WRITE_UNLOCK();
+}
+
+
+/**
+ * Write a horizontal span in which every pixel has the same color.
+ *
+ * The color is packed once with \c INIT_MONO_PIXEL into \c p and then stored
+ * with \c WRITE_PIXEL for each pixel left after clipping.
+ * NOTE(review): \c p is not declared in this function; it is presumably
+ * provided by the driver's \c LOCAL_VARS - confirm per driver.
+ *
+ * \param ctx    GL context.  Not referenced by name in this body.
+ * \param n      Number of pixels in the span.
+ * \param x      X coordinate of the first pixel.
+ * \param y      Y coordinate of the span; passed through \c Y_FLIP.
+ * \param color  The single color to write; elements 0..3 are what the
+ *               \c INIT_MONO_PIXEL variants defined in this file read.
+ * \param mask   If non-NULL, only pixels whose mask entry is non-zero are
+ *               written; if NULL every pixel left after clipping is written.
+ */
+static void TAG(WriteMonoRGBASpan)( const GLcontext *ctx,      
+                                   GLuint n, GLint x, GLint y, 
+                                   const GLchan color[4],
+                                   const GLubyte mask[] )
+{
+   HW_WRITE_LOCK()
+      {
+        GLint x1;
+        GLint n1;
+        LOCAL_VARS;
+        INIT_MONO_PIXEL(p, color);
+
+        y = Y_FLIP( y );
+
+        if (DBG) fprintf(stderr, "WriteMonoRGBASpan\n");
+
+        HW_WRITE_CLIPLOOP()
+           {
+              GLint i = 0;
+              CLIPSPAN(x,y,n,x1,n1,i);
+              if (mask)
+              {
+                 for (;n1>0;i++,x1++,n1--)
+                    if (mask[i])
+                       WRITE_PIXEL( x1, y, p );
+              }
+              else
+              {
+                 for (;n1>0;i++,x1++,n1--)
+                    WRITE_PIXEL( x1, y, p );
+              }
+           }
+        HW_ENDCLIPLOOP();
+      }
+   HW_WRITE_UNLOCK();
+}
+
+
+/**
+ * Write an array of individually positioned pixels that all share one color.
+ *
+ * The color is packed once with \c INIT_MONO_PIXEL into \c p; each pixel
+ * accepted by \c CLIPPIXEL is then stored with \c WRITE_PIXEL.
+ * NOTE(review): \c p is not declared in this function; it is presumably
+ * provided by the driver's \c LOCAL_VARS - confirm per driver.
+ *
+ * \param ctx    GL context.  Not referenced by name in this body.
+ * \param n      Number of pixels.
+ * \param x      X coordinate of each pixel.
+ * \param y      Y coordinate of each pixel; each is passed through \c Y_FLIP.
+ * \param color  The single color to write.
+ * \param mask   If non-NULL, pixels whose mask entry is zero are skipped;
+ *               if NULL all pixels are candidates.
+ */
+static void TAG(WriteMonoRGBAPixels)( const GLcontext *ctx,
+                                     GLuint n,
+                                     const GLint x[], const GLint y[],
+                                     const GLchan color[],
+                                     const GLubyte mask[] ) 
+{
+   HW_WRITE_LOCK()
+      {
+        GLint i;
+        LOCAL_VARS;
+        INIT_MONO_PIXEL(p, color);
+
+        if (DBG) fprintf(stderr, "WriteMonoRGBAPixels\n");
+
+        HW_WRITE_CLIPLOOP()
+           {
+              if (mask)
+              {
+                 for (i=0;i<n;i++)
+                    if (mask[i]) {
+                       int fy = Y_FLIP(y[i]);
+                       if (CLIPPIXEL( x[i], fy ))
+                          WRITE_PIXEL( x[i], fy, p );
+                    }
+              }
+              else
+              {
+                 for (i=0;i<n;i++) {
+                    int fy = Y_FLIP(y[i]);
+                    if (CLIPPIXEL( x[i], fy ))
+                       WRITE_PIXEL( x[i], fy, p );
+                 }
+              }
+           }
+        HW_ENDCLIPLOOP();
+      }
+   HW_WRITE_UNLOCK();
+}
+
+
+/**
+ * Read a horizontal span of pixels into an RGBA byte array (plain C path).
+ *
+ * Runs between \c HW_READ_LOCK and \c HW_READ_UNLOCK.  For every iteration
+ * of the \c HW_READ_CLIPLOOP the span is clipped with \c CLIPSPAN and each
+ * remaining pixel is unpacked with \c READ_RGBA into \p rgba[i].  Entries of
+ * \p rgba that correspond to clipped-away pixels are not written here.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels in the span.
+ * \param x     X coordinate of the first pixel.
+ * \param y     Y coordinate of the span; passed through \c Y_FLIP.
+ * \param rgba  Destination, one R,G,B,A quadruple per pixel.
+ */
+static void TAG(ReadRGBASpan)( const GLcontext *ctx,
+                              GLuint n, GLint x, GLint y,
+                              GLubyte rgba[][4])
+{
+   HW_READ_LOCK()
+      {
+        GLint x1,n1;
+        LOCAL_VARS;
+
+        y = Y_FLIP(y);
+
+        if (DBG) fprintf(stderr, "ReadRGBASpan\n");
+
+        HW_READ_CLIPLOOP()
+           {
+              GLint i = 0;
+              CLIPSPAN(x,y,n,x1,n1,i);
+              for (;n1>0;i++,x1++,n1--)
+                 READ_RGBA( rgba[i], x1, y );
+           }
+         HW_ENDCLIPLOOP();
+      }
+   HW_READ_UNLOCK();
+}
+
+
+#if defined(USE_MMX_ASM)
+/**
+ * MMX variant of the span reader.
+ *
+ * Same contract as the plain C span reader, but the pixels of each clipped
+ * span are converted by \c _generic_read_RGBA_span_BGRA8888_REV_MMX in one
+ * call.  As the routine's name says, it decodes the BGRA / 8_8_8_8_REV
+ * layout only, so this function must not be installed for any other
+ * SPANTMP_PIXEL_FMT / SPANTMP_PIXEL_TYPE combination.
+ *
+ * Unless USE_INNER_EMMS is defined, an \c emms instruction is issued on
+ * entry and again on exit.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels in the span.
+ * \param x     X coordinate of the first pixel.
+ * \param y     Y coordinate of the span; passed through \c Y_FLIP.
+ * \param rgba  Destination, one R,G,B,A quadruple per pixel.
+ */
+static void TAG2(ReadRGBASpan,_MMX)( const GLcontext *ctx,
+                              GLuint n, GLint x, GLint y,
+                              GLubyte rgba[][4])
+{
+#ifndef USE_INNER_EMMS
+   /* The EMMS instruction is directly in-lined here because using GCC's
+    * built-in _mm_empty function was found to utterly destroy performance.
+    */
+   __asm__ __volatile__( "emms" );
+#endif
+
+   /* Use the read-specific hooks, like the C reader does, so a driver that
+    * overrides HW_READ_LOCK / HW_READ_UNLOCK gets the same locking here.
+    */
+   HW_READ_LOCK()
+     {
+       GLint x1,n1;
+       LOCAL_VARS;
+
+       y = Y_FLIP(y);
+
+       if (DBG) fprintf(stderr, "ReadRGBASpan\n");
+
+       HW_READ_CLIPLOOP()
+         {
+            GLint i = 0;
+            CLIPSPAN(x,y,n,x1,n1,i);
+
+            /* The C reader only touches pixels while n1 > 0; apply the same
+             * test here so a fully clipped span never reaches the assembly
+             * routine with a zero or negative count.
+             */
+            if (n1 > 0)
+              {
+                 const char * src = (const char *) GET_SRC_PTR(x1, y);
+                 _generic_read_RGBA_span_BGRA8888_REV_MMX( src, rgba[i], n1 );
+              }
+         }
+       HW_ENDCLIPLOOP();
+     }
+   HW_READ_UNLOCK();
+#ifndef USE_INNER_EMMS
+   __asm__ __volatile__( "emms" );
+#endif
+}
+#endif
+
+
+#if defined(USE_SSE_ASM)
+/**
+ * SSE2 variant of the span reader.
+ *
+ * Same contract as the plain C span reader, but the pixels of each clipped
+ * span are converted by \c _generic_read_RGBA_span_BGRA8888_REV_SSE2 in one
+ * call.  As the routine's name says, it decodes the BGRA / 8_8_8_8_REV
+ * layout only, so this function must not be installed for any other
+ * SPANTMP_PIXEL_FMT / SPANTMP_PIXEL_TYPE combination.
+ *
+ * Unlike the MMX and SSE variants, no \c emms is issued around the call.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels in the span.
+ * \param x     X coordinate of the first pixel.
+ * \param y     Y coordinate of the span; passed through \c Y_FLIP.
+ * \param rgba  Destination, one R,G,B,A quadruple per pixel.
+ */
+static void TAG2(ReadRGBASpan,_SSE2)( const GLcontext *ctx,
+                              GLuint n, GLint x, GLint y,
+                              GLubyte rgba[][4])
+{
+   /* Use the read-specific hooks, like the C reader does, so a driver that
+    * overrides HW_READ_LOCK / HW_READ_UNLOCK gets the same locking here.
+    */
+   HW_READ_LOCK()
+     {
+       GLint x1,n1;
+       LOCAL_VARS;
+
+       y = Y_FLIP(y);
+
+       if (DBG) fprintf(stderr, "ReadRGBASpan\n");
+
+       HW_READ_CLIPLOOP()
+         {
+            GLint i = 0;
+            CLIPSPAN(x,y,n,x1,n1,i);
+
+            /* The C reader only touches pixels while n1 > 0; apply the same
+             * test here so a fully clipped span never reaches the assembly
+             * routine with a zero or negative count.
+             */
+            if (n1 > 0)
+              {
+                 const char * src = (const char *) GET_SRC_PTR(x1, y);
+                 _generic_read_RGBA_span_BGRA8888_REV_SSE2( src, rgba[i], n1 );
+              }
+         }
+       HW_ENDCLIPLOOP();
+     }
+   HW_READ_UNLOCK();
+}
+#endif
+
+#if defined(USE_SSE_ASM)
+/**
+ * SSE variant of the span reader.
+ *
+ * Same contract as the plain C span reader, but the pixels of each clipped
+ * span are converted by \c _generic_read_RGBA_span_BGRA8888_REV_SSE in one
+ * call.  As the routine's name says, it decodes the BGRA / 8_8_8_8_REV
+ * layout only, so this function must not be installed for any other
+ * SPANTMP_PIXEL_FMT / SPANTMP_PIXEL_TYPE combination.
+ *
+ * Unless USE_INNER_EMMS is defined, an \c emms instruction is issued on
+ * entry and again on exit.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels in the span.
+ * \param x     X coordinate of the first pixel.
+ * \param y     Y coordinate of the span; passed through \c Y_FLIP.
+ * \param rgba  Destination, one R,G,B,A quadruple per pixel.
+ */
+static void TAG2(ReadRGBASpan,_SSE)( const GLcontext *ctx,
+                              GLuint n, GLint x, GLint y,
+                              GLubyte rgba[][4])
+{
+#ifndef USE_INNER_EMMS
+   /* The EMMS instruction is directly in-lined here because using GCC's
+    * built-in _mm_empty function was found to utterly destroy performance.
+    */
+   __asm__ __volatile__( "emms" );
+#endif
+
+   /* Use the read-specific hooks, like the C reader does, so a driver that
+    * overrides HW_READ_LOCK / HW_READ_UNLOCK gets the same locking here.
+    */
+   HW_READ_LOCK()
+     {
+       GLint x1,n1;
+       LOCAL_VARS;
+
+       y = Y_FLIP(y);
+
+       if (DBG) fprintf(stderr, "ReadRGBASpan\n");
+
+       HW_READ_CLIPLOOP()
+         {
+            GLint i = 0;
+            CLIPSPAN(x,y,n,x1,n1,i);
+
+            /* The C reader only touches pixels while n1 > 0; apply the same
+             * test here so a fully clipped span never reaches the assembly
+             * routine with a zero or negative count.
+             */
+            if (n1 > 0)
+              {
+                 const char * src = (const char *) GET_SRC_PTR(x1, y);
+                 _generic_read_RGBA_span_BGRA8888_REV_SSE( src, rgba[i], n1 );
+              }
+         }
+       HW_ENDCLIPLOOP();
+     }
+   HW_READ_UNLOCK();
+#ifndef USE_INNER_EMMS
+   __asm__ __volatile__( "emms" );
+#endif
+}
+#endif
+
+
+/**
+ * Read an array of individually positioned pixels into an RGBA byte array.
+ *
+ * For every iteration of the \c HW_READ_CLIPLOOP all \p n pixels are
+ * visited; a pixel is unpacked with \c READ_RGBA into \p rgba[i] only if
+ * \c CLIPPIXEL accepts its x and \c Y_FLIP'ed y coordinate.  Entries for
+ * rejected or masked-out pixels are not written here.
+ *
+ * \param ctx   GL context.  Not referenced by name in this body.
+ * \param n     Number of pixels.
+ * \param x     X coordinate of each pixel.
+ * \param y     Y coordinate of each pixel; each is passed through \c Y_FLIP.
+ * \param rgba  Destination, one R,G,B,A quadruple per pixel.
+ * \param mask  If non-NULL, pixels whose mask entry is zero are skipped;
+ *              if NULL all pixels are candidates.
+ */
+static void TAG(ReadRGBAPixels)( const GLcontext *ctx,
+                                GLuint n, const GLint x[], const GLint y[],
+                                GLubyte rgba[][4], const GLubyte mask[] )
+{
+   HW_READ_LOCK()
+      {
+        GLint i;
+        LOCAL_VARS;
+
+        if (DBG) fprintf(stderr, "ReadRGBAPixels\n");
+
+        HW_READ_CLIPLOOP()
+           {
+              if (mask)
+              {
+                 for (i=0;i<n;i++)
+                    if (mask[i]) {
+                       int fy = Y_FLIP( y[i] );
+                       if (CLIPPIXEL( x[i], fy ))
+                          READ_RGBA( rgba[i], x[i], fy );
+                    }
+              }
+              else
+              {
+                 for (i=0;i<n;i++) {
+                    int fy = Y_FLIP( y[i] );
+                    if (CLIPPIXEL( x[i], fy ))
+                       READ_RGBA( rgba[i], x[i], fy );
+                 }
+              }
+           }
+        HW_ENDCLIPLOOP();
+      }
+   HW_READ_UNLOCK();
+}
+
+/* The MMX / SSE / SSE2 span readers all call
+ * _generic_read_RGBA_span_BGRA8888_REV_*, i.e. they decode the BGRA /
+ * 8_8_8_8_REV layout only.  They may therefore be selected only when this
+ * instantiation of the template uses that layout; every other layout (for
+ * example RGB565) must keep the C reader.  In such an instantiation the SIMD
+ * readers are still compiled but left unreferenced.
+ */
+#if (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
+# define SPANTMP_SIMD_READ_OK 1
+#else
+# define SPANTMP_SIMD_READ_OK 0
+#endif
+
+/**
+ * Install this instantiation's span / pixel functions into \p swdd.
+ *
+ * The write functions and ReadRGBAPixels are always the C versions defined
+ * in this file.  ReadRGBASpan is chosen in this order of preference, each
+ * step requiring both compile-time support and the matching run-time CPU
+ * flag: SSE2 (\c cpu_has_xmm2), SSE (\c cpu_has_xmm), MMX (\c cpu_has_mmx),
+ * and finally the plain C version.  The SIMD versions are considered only
+ * for the BGRA / 8_8_8_8_REV pixel layout.
+ *
+ * \param swdd  Driver function table to fill in.
+ */
+static void TAG(InitPointers)(struct swrast_device_driver *swdd)
+{
+   swdd->WriteRGBASpan = TAG(WriteRGBASpan);
+   swdd->WriteRGBSpan = TAG(WriteRGBSpan);
+   swdd->WriteMonoRGBASpan = TAG(WriteMonoRGBASpan);
+   swdd->WriteRGBAPixels = TAG(WriteRGBAPixels);
+   swdd->WriteMonoRGBAPixels = TAG(WriteMonoRGBAPixels);
+   swdd->ReadRGBAPixels = TAG(ReadRGBAPixels);
+
+   /* Each enabled section below ends in a dangling "else", so the sections
+    * form a single if / else-if chain that terminates in the C fallback.
+    */
+#if defined(USE_SSE_ASM) && SPANTMP_SIMD_READ_OK
+   if ( cpu_has_xmm2 ) {
+      if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE2" );
+      swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE2);
+   }
+   else
+#endif
+#if defined(USE_SSE_ASM) && SPANTMP_SIMD_READ_OK
+   if ( cpu_has_xmm ) {
+      if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE" );
+      swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE);
+   }
+   else
+#endif
+#if defined(USE_MMX_ASM) && SPANTMP_SIMD_READ_OK
+   if ( cpu_has_mmx ) {
+      if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "MMX" );
+      swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _MMX);
+   }
+   else
+#endif
+   {
+      if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "C" );
+      swdd->ReadRGBASpan = TAG(ReadRGBASpan);
+   }
+
+}
+
+#undef SPANTMP_SIMD_READ_OK
+
+
+/* Remove the macros defined by this template and the per-instantiation
+ * parameters supplied by the including file, so that the template can be
+ * included again in the same translation unit with a different pixel
+ * format, pointer macros and TAG / TAG2 names.
+ */
+#undef INIT_MONO_PIXEL
+#undef WRITE_PIXEL
+#undef WRITE_RGBA
+#undef READ_RGBA
+#undef TAG
+#undef TAG2
+#undef GET_SRC_PTR
+#undef GET_DST_PTR
+#undef SPANTMP_PIXEL_FMT
+#undef SPANTMP_PIXEL_TYPE
diff --git a/src/mesa/drivers/dri/r128/r128_span.c b/src/mesa/drivers/dri/r128/r128_span.c

index db2ec44fbf2ab5689d922149c269c3c9bb8bc44e..b169dc7509ec944cbb3089b1eda4336a95dd3e45 100644 (file)
--- a/src/mesa/drivers/dri/r128/r128_span.c
+++ b/src/mesa/drivers/dri/r128/r128_span.c
@@ -125,85 +125,27 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  
  /* 16 bit, RGB565 color spanline and pixel functions
   */
-#undef INIT_MONO_PIXEL
-#define INIT_MONO_PIXEL(p, color) \
-  p = R128PACKCOLOR565( color[0], color[1], color[2] )
+#define GET_SRC_PTR(_x, _y) (read_buf + _x * 2 + _y * pitch)
+#define GET_DST_PTR(_x, _y) (     buf + _x * 2 + _y * pitch)
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
  
-#define WRITE_RGBA( _x, _y, r, g, b, a )                               \
-   *(GLushort *)(buf + _x*2 + _y*pitch) = ((((int)r & 0xf8) << 8) |    \
-                                          (((int)g & 0xfc) << 3) |     \
-                                          (((int)b & 0xf8) >> 3))
+#define TAG(x)    r128##x##_RGB565
+#define TAG2(x,y) r128##x##_RGB565##y
+#include "spantmp2.h"
  
-#define WRITE_PIXEL( _x, _y, p )                                       \
-   *(GLushort *)(buf + _x*2 + _y*pitch) = p
-
-#define READ_RGBA( rgba, _x, _y )                                      \
-   do {                                                                        \
-      GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch);          \
-      rgba[0] = (p >> 8) & 0xf8;                                       \
-      rgba[1] = (p >> 3) & 0xfc;                                       \
-      rgba[2] = (p << 3) & 0xf8;                                       \
-      rgba[3] = 0xff;                                                  \
-      if ( rgba[0] & 0x08 ) rgba[0] |= 0x07;                           \
-      if ( rgba[1] & 0x04 ) rgba[1] |= 0x03;                           \
-      if ( rgba[2] & 0x08 ) rgba[2] |= 0x07;                           \
-   } while (0)
-
-#define TAG(x) r128##x##_RGB565
-#include "spantmp.h"
-
-#define READ_DEPTH(d, _x, _y)                                                 \
-    d = *(GLushort *)(buf + _x*2 + _y*pitch)
  
  /* 32 bit, ARGB8888 color spanline and pixel functions
   */
-#undef INIT_MONO_PIXEL
-#define INIT_MONO_PIXEL(p, color) \
-  p = R128PACKCOLOR8888( color[0], color[1], color[2], color[3] )
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )                               \
-   *(GLuint *)(buf + _x*4 + _y*pitch) = ((b <<  0) |                   \
-                                        (g <<  8) |                    \
-                                        (r << 16) |                    \
-                                        (a << 24) )
+#define GET_SRC_PTR(_x, _y) (read_buf + _x * 4 + _y * pitch)
+#define GET_DST_PTR(_x, _y) (     buf + _x * 4 + _y * pitch)
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
  
-#define WRITE_PIXEL( _x, _y, p )                                       \
-   *(GLuint *)(buf + _x*4 + _y*pitch) = p
+#define TAG(x)    r128##x##_ARGB8888
+#define TAG2(x,y) r128##x##_ARGB8888##y
+#include "spantmp2.h"
  
-#define READ_RGBA( rgba, _x, _y )                                      \
-do {                                                                   \
-   GLuint p = *(GLuint *)(read_buf + _x*4 + _y*pitch);                 \
-   rgba[0] = (p >> 16) & 0xff;                                         \
-   rgba[1] = (p >>  8) & 0xff;                                         \
-   rgba[2] = (p >>  0) & 0xff;                                         \
-   rgba[3] = 0xff;/*(p >> 24) & 0xff;*/                                                \
-} while (0)
-
-#define TAG(x) r128##x##_ARGB8888
-#include "spantmp.h"
-
-
-/* 24 bit, RGB888 color spanline and pixel functions */
-#undef INIT_MONO_PIXEL
-#define INIT_MONO_PIXEL(p, color) \
-  p = R128PACKCOLOR888( color[0], color[1], color[2] )
-
-#define WRITE_RGBA(_x, _y, r, g, b, a)                                        \
-    *(GLuint *)(buf + _x*3 + _y*pitch) = ((r << 16) |                         \
-                                         (g << 8)  |                         \
-                                         (b << 0))
-
-#define WRITE_PIXEL(_x, _y, p)                                                \
-    *(GLuint *)(buf + _x*3 + _y*pitch) = p
-
-#define READ_RGBA(rgba, _x, _y)                                               \
-    do {                                                                      \
-       GLuint p = *(GLuint *)(read_buf + _x*3 + _y*pitch);                   \
-       rgba[0] = (p >> 16) & 0xff;                                           \
-       rgba[1] = (p >> 8)  & 0xff;                                           \
-       rgba[2] = (p >> 0)  & 0xff;                                           \
-       rgba[3] = 0xff;                                                       \
-    } while (0)
  
  /* ================================================================
   * Depth buffer
@@ -211,6 +153,9 @@ do {                                                                        \
  
  /* 16-bit depth buffer functions
   */
+#define READ_DEPTH(d, _x, _y)                                           \
+    d = *(GLushort *)(buf + _x*2 + _y*pitch)
+
  #define WRITE_DEPTH_SPAN()                                             \
     r128WriteDepthSpanLocked( rmesa, n,                                 \
                              x + dPriv->x,                              \
@@ -423,23 +368,11 @@ void r128DDInitSpanFuncs( GLcontext *ctx )
  
     switch ( rmesa->r128Screen->cpp ) {
     case 2:
-      swdd->WriteRGBASpan      = r128WriteRGBASpan_RGB565;
-      swdd->WriteRGBSpan       = r128WriteRGBSpan_RGB565;
-      swdd->WriteMonoRGBASpan  = r128WriteMonoRGBASpan_RGB565;
-      swdd->WriteRGBAPixels    = r128WriteRGBAPixels_RGB565;
-      swdd->WriteMonoRGBAPixels        = r128WriteMonoRGBAPixels_RGB565;
-      swdd->ReadRGBASpan       = r128ReadRGBASpan_RGB565;
-      swdd->ReadRGBAPixels     = r128ReadRGBAPixels_RGB565;
+      r128InitPointers_RGB565( swdd );
        break;
  
     case 4:
-      swdd->WriteRGBASpan      = r128WriteRGBASpan_ARGB8888;
-      swdd->WriteRGBSpan       = r128WriteRGBSpan_ARGB8888;
-      swdd->WriteMonoRGBASpan  = r128WriteMonoRGBASpan_ARGB8888;
-      swdd->WriteRGBAPixels    = r128WriteRGBAPixels_ARGB8888;
-      swdd->WriteMonoRGBAPixels        = r128WriteMonoRGBAPixels_ARGB8888;
-      swdd->ReadRGBASpan       = r128ReadRGBASpan_ARGB8888;
-      swdd->ReadRGBAPixels     = r128ReadRGBAPixels_ARGB8888;
+      r128InitPointers_ARGB8888( swdd );
        break;
  
     default:
diff --git a/src/mesa/drivers/dri/r200/r200_span.c b/src/mesa/drivers/dri/r200/r200_span.c

index e1ad976ce3b8e333cf7ab3136c234d5e979df012..efb5e6ea7c5f97e398d3327181439ee74d1dd977 100644 (file)
--- a/src/mesa/drivers/dri/r200/r200_span.c
+++ b/src/mesa/drivers/dri/r200/r200_span.c
@@ -122,61 +122,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  
  /* 16 bit, RGB565 color spanline and pixel functions
   */
-#define INIT_MONO_PIXEL(p, color) \
-  p = PACK_COLOR_565( color[0], color[1], color[2] )
  
-#define WRITE_RGBA( _x, _y, r, g, b, a )                               \
-   *(GLushort *)(buf + _x*2 + _y*pitch) = ((((int)r & 0xf8) << 8) |    \
-                                          (((int)g & 0xfc) << 3) |     \
-                                          (((int)b & 0xf8) >> 3))
+#define GET_SRC_PTR(_x, _y) (read_buf + _x * 2 + _y * pitch)
+#define GET_DST_PTR(_x, _y) (     buf + _x * 2 + _y * pitch)
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
  
-#define WRITE_PIXEL( _x, _y, p )                                       \
-   *(GLushort *)(buf + _x*2 + _y*pitch) = p
-
-#define READ_RGBA( rgba, _x, _y )                                      \
-   do {                                                                        \
-      GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch);          \
-      rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8;                                \
-      rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc;                                \
-      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;                                \
-      rgba[3] = 0xff;                                                  \
-   } while (0)
-
-#define TAG(x) r200##x##_RGB565
-#include "spantmp.h"
+#define TAG(x)    r200##x##_RGB565
+#define TAG2(x,y) r200##x##_RGB565##y
+#include "spantmp2.h"
  
  /* 32 bit, ARGB8888 color spanline and pixel functions
   */
-#undef INIT_MONO_PIXEL
-#define INIT_MONO_PIXEL(p, color) \
-  p = PACK_COLOR_8888( color[3], color[0], color[1], color[2] )
-
-#define WRITE_RGBA( _x, _y, r, g, b, a )                       \
-do {                                                           \
-   *(GLuint *)(buf + _x*4 + _y*pitch) = ((b <<  0) |           \
-                                        (g <<  8) |            \
-                                        (r << 16) |            \
-                                        (a << 24) );           \
-} while (0)
-
-#define WRITE_PIXEL( _x, _y, p )                       \
-do {                                                   \
-   *(GLuint *)(buf + _x*4 + _y*pitch) = p;             \
-} while (0)
-
-#define READ_RGBA( rgba, _x, _y )                              \
-do {                                                           \
-   volatile GLuint *ptr = (volatile GLuint *)(read_buf + _x*4 + _y*pitch); \
-   GLuint p = *ptr;                                    \
-   rgba[0] = (p >> 16) & 0xff;                                 \
-   rgba[1] = (p >>  8) & 0xff;                                 \
-   rgba[2] = (p >>  0) & 0xff;                                 \
-   rgba[3] = (p >> 24) & 0xff;                                 \
-} while (0)
  
-#define TAG(x) r200##x##_ARGB8888
-#include "spantmp.h"
+#define GET_SRC_PTR(_x, _y) (read_buf + _x * 4 + _y * pitch)
+#define GET_DST_PTR(_x, _y) (     buf + _x * 4 + _y * pitch)
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
  
+#define TAG(x)    r200##x##_ARGB8888
+#define TAG2(x,y) r200##x##_ARGB8888##y
+#include "spantmp2.h"
  
  
  /* ================================================================
@@ -380,23 +346,11 @@ void r200InitSpanFuncs( GLcontext *ctx )
  
     switch ( rmesa->r200Screen->cpp ) {
     case 2:
-      swdd->WriteRGBASpan      = r200WriteRGBASpan_RGB565;
-      swdd->WriteRGBSpan       = r200WriteRGBSpan_RGB565;
-      swdd->WriteMonoRGBASpan  = r200WriteMonoRGBASpan_RGB565;
-      swdd->WriteRGBAPixels    = r200WriteRGBAPixels_RGB565;
-      swdd->WriteMonoRGBAPixels        = r200WriteMonoRGBAPixels_RGB565;
-      swdd->ReadRGBASpan       = r200ReadRGBASpan_RGB565;
-      swdd->ReadRGBAPixels      = r200ReadRGBAPixels_RGB565;
+      r200InitPointers_RGB565( swdd );
        break;
  
     case 4:
-      swdd->WriteRGBASpan      = r200WriteRGBASpan_ARGB8888;
-      swdd->WriteRGBSpan       = r200WriteRGBSpan_ARGB8888;
-      swdd->WriteMonoRGBASpan   = r200WriteMonoRGBASpan_ARGB8888;
-      swdd->WriteRGBAPixels     = r200WriteRGBAPixels_ARGB8888;
-      swdd->WriteMonoRGBAPixels = r200WriteMonoRGBAPixels_ARGB8888;
-      swdd->ReadRGBASpan       = r200ReadRGBASpan_ARGB8888;
-      swdd->ReadRGBAPixels      = r200ReadRGBAPixels_ARGB8888;
+      r200InitPointers_ARGB8888( swdd );
        break;
  
     default:
diff --git a/src/mesa/drivers/dri/unichrome/via_span.c b/src/mesa/drivers/dri/unichrome/via_span.c

index 693b6de1428d6990f34eb87c5bd9369a472b0b43..3a747a3d99991904bcb315b6120128b6037bece5 100644 (file)
--- a/src/mesa/drivers/dri/unichrome/via_span.c
+++ b/src/mesa/drivers/dri/unichrome/via_span.c
@@ -204,19 +204,6 @@
  #undef LOCAL_VARS
  #undef LOCAL_DEPTH_VARS
   
-/*=* [DBG] csmash : fix options worng position *=*/
-/*#define LOCAL_VARS                                    \
-    __DRIdrawablePrivate *dPriv = vmesa->driDrawable;   \
-    GLuint pitch = vmesa->drawPitch;                    \
-    GLuint height = dPriv->h;                           \
-    GLuint p;                                           \
-    char *buf = (char *)(vmesa->drawMap +               \
-                         dPriv->x * 4 +                 \
-                         dPriv->y * pitch);             \
-    char *read_buf = (char *)(vmesa->readMap +          \
-                              dPriv->x * 4 +            \
-                              dPriv->y * pitch);        \
-    (void)read_buf; (void)buf; (void)p*/
  #define LOCAL_VARS                                                     \
      __DRIdrawablePrivate *dPriv = vmesa->driDrawable;                  \
      GLuint pitch = vmesa->drawPitch;                                   \
@@ -237,33 +224,15 @@
                                dPriv->y * pitch);                       \
      }
  
+#define GET_SRC_PTR(_x, _y) (read_buf + _x * 4 + _y * pitch)
+#define GET_DST_PTR(_x, _y) (     buf + _x * 4 + _y * pitch)
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
  
-#undef INIT_MONO_PIXEL
-#define INIT_MONO_PIXEL(p, color)                       \
-    p = PACK_COLOR_8888(color[3], color[0], color[1], color[2]) 
-    
-#define WRITE_RGBA(_x, _y, r, g, b, a)                                  \
-    *(GLuint *)(buf + _x * 4 + _y * pitch) = ((r << 16) |              \
-                                              (g << 8) |               \
-                                              (b << 0) |               \
-                                              (a << 24));
-                                              
-
-#define WRITE_PIXEL(_x, _y, p)                      \
-    *(GLuint *)(buf + _x * 4 + _y * pitch) = p
-
-#define READ_RGBA(rgba, _x, _y)                                         \
-    do {                                                                \
-        GLuint p = *(GLuint *)(read_buf + _x * 4 + _y * pitch);         \
-        rgba[0] = (p >> 16) & 0xff;                                    \
-        rgba[1] = (p >> 8) & 0xff;                                     \
-        rgba[2] = (p >> 0) & 0xff;                                     \
-        rgba[3] = 255;                                                  \
-    } while (0)
+#define TAG(x)    via##x##_8888
+#define TAG2(x,y) via##x##_8888##y
+#include "spantmp2.h"
  
-#define TAG(x) via##x##_8888
-#include "spantmp.h"
-/*#include "via_spantmp.h"*/
  
  /* 16 bit depthbuffer functions.
   */
@@ -367,13 +336,7 @@ void viaInitSpanFuncs(GLcontext *ctx)
         swdd->ReadRGBAPixels = viaReadRGBAPixels_565;
      }
      else if (vmesa->viaScreen->bitsPerPixel == 0x20) {
-       swdd->WriteRGBASpan = viaWriteRGBASpan_8888;
-       swdd->WriteRGBSpan = viaWriteRGBSpan_8888;
-       swdd->WriteMonoRGBASpan = viaWriteMonoRGBASpan_8888;
-       swdd->WriteRGBAPixels = viaWriteRGBAPixels_8888;
-       swdd->WriteMonoRGBAPixels = viaWriteMonoRGBAPixels_8888;
-       swdd->ReadRGBASpan = viaReadRGBASpan_8888;
-       swdd->ReadRGBAPixels = viaReadRGBAPixels_8888;
+       viaInitPointers_8888( swdd );
      }
      else 
         ASSERT(0);
diff --git a/src/mesa/sources b/src/mesa/sources

index 7013182f6f26c04908dbcbd78b82739264193b0f..f82d344bf582605d4d54578c20a3fc5ff8a451b5 100644 (file)
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -170,6 +170,7 @@ X86_SOURCES =                       \
         x86/sse_xform3.S        \
         x86/sse_xform4.S        \
         x86/sse_normal.S \
+       x86/read_rgba_span_x86.S \
         tnl/t_vtx_x86_gcc.S
  
  X86_API =                      \
diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S

new file mode 100644 (file)

index 0000000..e637f22
--- /dev/null
+++ b/src/mesa/x86/read_rgba_span_x86.S
@@ -0,0 +1,453 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+ 
+/**
+ * \file read_rgba_span_x86.S
+ * Optimized routines to transfer pixel data from the framebuffer to a
+ * buffer in main memory.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+       .file   "read_rgba_span_x86.S"
+       .section        .rodata
+       .align 16
+       .type   mask, @object
+       .size   mask, 32
+/* Bit masks applied to 32-bit pixels by the routines in this file.
+ *
+ *   mask      (bytes  0-15): 0xff00ff00 in every dword, i.e. keeps bits
+ *                            8-15 and 24-31 of each pixel.
+ *   mask+16   (bytes 16-31): 0x00ff0000 in every dword, i.e. keeps bits
+ *                            16-23 of each pixel.
+ *
+ * Each half is 16 bytes so it can be loaded whole into an XMM register
+ * (or its first 8 bytes into an MMX register).
+ */
+mask:
+       .long   0xff00ff00
+       .long   0xff00ff00
+       .long   0xff00ff00
+       .long   0xff00ff00
+       .long   0x00ff0000
+       .long   0x00ff0000
+       .long   0x00ff0000
+       .long   0x00ff0000
+
+
+/* I implemented these as macros because the appear in quite a few places,
+ * and I've tweaked them a number of times.  I got tired of changing every
+ * place they appear. :)
+ */
+
+/* Convert one pixel using integer instructions only: load the 32-bit pixel
+ * at (%ebx), reorder its bytes, store the result at (%ecx), and advance
+ * both pointers by 4.  Clobbers %eax and the flags.
+ */
+#define DO_ONE_PIXEL() \
+       movl    (%ebx), %eax ; \
+       addl    $4, %ebx ; \
+       bswap   %eax          /* ARGB -> BGRA */ ; \
+       rorl    $8, %eax      /* BGRA -> ABGR */ ; \
+       movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
+       addl    $4, %ecx
+
+/* Same conversion as DO_ONE_PIXEL, but neither pointer is advanced; meant
+ * for the final pixel of a span.  Clobbers %eax and the flags.
+ *
+ * The last line deliberately has no trailing "; \" so the macro cannot
+ * swallow whatever line happens to follow its definition.
+ */
+#define DO_ONE_LAST_PIXEL() \
+       movl    (%ebx), %eax ; \
+       bswap   %eax          /* ARGB -> BGRA */ ; \
+       rorl    $8, %eax      /* BGRA -> ABGR */ ; \
+       movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */
+
+
+/**
+ * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
+ *
+ * Arguments are read from the stack in this order: source pointer,
+ * destination pointer, number of pixels to copy.
+ *
+ * \warning
+ * This function assumes that the caller will issue the EMMS instruction
+ * at the correct places.
+ */
+
+       /* The mask table earlier in this file selects .rodata; switch back
+        * to .text so the code is not assembled into a read-only data
+        * section.
+        */
+       .text
+.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+       .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
+_generic_read_RGBA_span_BGRA8888_REV_MMX:
+       pushl   %ebx
+
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+       movq    mask, %mm1
+       movq    mask+16, %mm2
+
+       movl    8(%esp), %ebx   /* source pointer */
+       movl    16(%esp), %edx  /* number of pixels to copy */
+       movl    12(%esp), %ecx  /* destination pointer */
+
+       testl   %edx, %edx
+       je      .L20            /* Bail if there's nothing to do. */
+
+       /* If a 4-byte aligned source is not also 8-byte aligned, convert
+        * one pixel with integer code first so that the MMX loop reads
+        * from an 8-byte boundary.
+        */
+
+       movl    %ebx, %eax
+
+       negl    %eax
+       sarl    $2, %eax
+       andl    $1, %eax
+       je      .L17
+
+       subl    %eax, %edx
+       DO_ONE_PIXEL()
+.L17:
+
+       /* Would it be faster to unroll this loop once and process 4 pixels
+        * per pass, instead of just two?
+        */
+
+       /* %eax = number of pixel pairs.  The jump enters the loop at its
+        * test; jmp leaves the flags alone, so the first jne still sees
+        * the zero flag produced by shrl.
+        */
+
+       movl    %edx, %eax
+       shrl    %eax
+       jmp     .L18
+.L19:
+       movq    (%ebx), %mm0
+       addl    $8, %ebx
+
+       /* These 9 instructions do what PSHUFB (if there were such an
+        * instruction) could do in 1. :(
+        */
+
+       movq    %mm0, %mm3
+       movq    %mm0, %mm4
+
+       pand    %mm2, %mm3
+       psllq   $16, %mm4
+       psrlq   $16, %mm3
+       pand    %mm2, %mm4
+
+       pand    %mm1, %mm0
+       por     %mm4, %mm3
+       por     %mm3, %mm0
+
+       movq    %mm0, (%ecx)
+       addl    $8, %ecx
+       subl    $1, %eax
+.L18:
+       jne     .L19
+
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+
+       /* At this point there are either 1 or 0 pixels remaining to be
+        * converted.  Convert the last pixel, if needed.
+        */
+
+       testl   $1, %edx
+       je      .L20
+
+       DO_ONE_LAST_PIXEL()
+
+.L20:
+       popl    %ebx
+       ret
+       .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
+
+
+/**
+ * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
+ * instructions are only actually used to read data from the framebuffer.
+ * In practice, the speed-up is pretty small.
+ *
+ * Arguments are read from the stack in this order: source pointer,
+ * destination pointer, number of pixels to copy.
+ *
+ * \todo
+ * Do some more testing and determine if there's any reason to have this
+ * function in addition to the MMX version.
+ *
+ * \warning
+ * This function assumes that the caller will issue the EMMS instruction
+ * at the correct places.
+ */
+
+       /* Make sure the code is assembled into .text regardless of which
+        * section was selected before this point.
+        */
+       .text
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+       .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE:
+       pushl   %esi
+       pushl   %ebx
+       pushl   %ebp
+
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+       movq    mask, %mm1
+       movq    mask+16, %mm2
+
+       movl    16(%esp), %ebx  /* source pointer */
+       movl    24(%esp), %edx  /* number of pixels to copy */
+       movl    20(%esp), %ecx  /* destination pointer */
+
+       /* Carve out a 16-byte aligned scratch slot on the stack; it is used
+        * below to move data from an XMM register to MMX registers.  The
+        * original stack pointer is kept in %ebp.
+        */
+
+       movl    %esp, %ebp
+       subl    $16, %esp
+       andl    $0xfffffff0, %esp
+
+       movl    %ebx, %eax
+       movl    %edx, %esi
+
+       /* %esi = min(pixels needed to reach a 16-byte aligned source,
+        * total pixel count).  The count is unsigned, so the comparison is
+        * unsigned as well.
+        */
+
+       negl    %eax
+       andl    $15, %eax
+       sarl    $2, %eax
+       cmpl    %edx, %eax
+       cmovbe  %eax, %esi
+
+       subl    %esi, %edx
+
+       testl   $1, %esi
+       je      .L32
+
+       DO_ONE_PIXEL()
+.L32:
+
+       testl   $2, %esi
+       je      .L31
+
+       movq    (%ebx), %mm0
+       addl    $8, %ebx
+
+       movq    %mm0, %mm3
+       movq    %mm0, %mm4
+
+       pand    %mm2, %mm3
+       psllq   $16, %mm4
+       psrlq   $16, %mm3
+       pand    %mm2, %mm4
+
+       pand    %mm1, %mm0
+       por     %mm4, %mm3
+       por     %mm3, %mm0
+
+       movq    %mm0, (%ecx)
+       addl    $8, %ecx
+.L31:
+
+       /* %eax = number of 4-pixel groups.  The jump enters the loop at
+        * its test; jmp leaves the flags alone, so the first jne still
+        * sees the zero flag produced by shrl.
+        */
+
+       movl    %edx, %eax
+       shrl    $2, %eax
+       jmp     .L33
+.L34:
+       movaps  (%ebx), %xmm0
+       addl    $16, %ebx
+
+       /* This would be so much better if we could just move directly from
+        * an SSE register to an MMX register.  Unfortunately, that
+        * functionality wasn't introduced until SSE2 with the MOVDQ2Q
+        * instruction.
+        */
+
+       movaps  %xmm0, (%esp)
+       movq    (%esp), %mm0
+       movq    8(%esp), %mm5
+
+       movq    %mm0, %mm3
+       movq    %mm0, %mm4
+       movq    %mm5, %mm6
+       movq    %mm5, %mm7
+
+       pand    %mm2, %mm3
+       pand    %mm2, %mm6
+
+       psllq   $16, %mm4
+       psllq   $16, %mm7
+
+       psrlq   $16, %mm3
+       psrlq   $16, %mm6
+
+       pand    %mm2, %mm4
+       pand    %mm2, %mm7
+
+       pand    %mm1, %mm0
+       pand    %mm1, %mm5
+
+       por     %mm4, %mm3
+       por     %mm7, %mm6
+
+       por     %mm3, %mm0
+       por     %mm6, %mm5
+
+       movq    %mm0, (%ecx)
+       movq    %mm5, 8(%ecx)
+       addl    $16, %ecx
+
+       subl    $1, %eax
+.L33:
+       jne     .L34
+
+       /* The scratch slot is no longer needed; restore the stack pointer. */
+       movl    %ebp, %esp
+
+       /* At this point there are between 0 and 3 pixels remaining to be
+        * converted.
+        */
+
+       testl   $2, %edx
+       je      .L36
+
+       movq    (%ebx), %mm0
+       addl    $8, %ebx
+
+       movq    %mm0, %mm3
+       movq    %mm0, %mm4
+
+       pand    %mm2, %mm3
+       psllq   $16, %mm4
+       psrlq   $16, %mm3
+       pand    %mm2, %mm4
+
+       pand    %mm1, %mm0
+       por     %mm4, %mm3
+       por     %mm3, %mm0
+
+       movq    %mm0, (%ecx)
+       addl    $8, %ecx
+.L36:
+
+       /* EMMS has to come after the last MMX instruction; the 2-pixel
+        * tail above still uses MMX registers.  EMMS does not modify the
+        * flags or the integer registers.
+        */
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+
+       testl   $1, %edx
+       je      .L35
+
+       DO_ONE_LAST_PIXEL()
+.L35:
+       popl    %ebp
+       popl    %ebx
+       popl    %esi
+       ret
+       .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
+
+
+/**
+ * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
+ *
+ * Arguments are read from the stack in this order: source pointer,
+ * destination pointer, number of pixels to copy.
+ */
+
+       .text
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+       .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE2:
+       pushl   %esi
+       pushl   %ebx
+
+       movdqa  mask, %xmm1
+       movdqa  mask+16, %xmm2
+
+       movl    12(%esp), %ebx  /* source pointer */
+       movl    20(%esp), %edx  /* number of pixels to copy */
+       movl    16(%esp), %ecx  /* destination pointer */
+
+       movl    %ebx, %eax
+       movl    %edx, %esi
+
+       /* If the source pointer isn't a multiple of 16 we have to process
+        * a few pixels the "slow" way to get the address aligned for
+        * the SSE fetch instructions.
+        *
+        * %esi = min(pixels needed to reach alignment, total pixel count).
+        */
+
+       negl    %eax
+       andl    $15, %eax
+       sarl    $2, %eax
+
+       cmpl    %edx, %eax
+       cmovbe  %eax, %esi
+       subl    %esi, %edx
+
+       testl   $1, %esi
+       je      .L41
+
+       DO_ONE_PIXEL()
+.L41:
+       testl   $2, %esi
+       je      .L40
+
+       movq    (%ebx), %xmm0
+       addl    $8, %ebx
+
+       movdqa  %xmm0, %xmm3
+       movdqa  %xmm0, %xmm4
+       andps   %xmm1, %xmm0
+
+       andps   %xmm2, %xmm3
+       pslldq  $2, %xmm4
+       psrldq  $2, %xmm3
+       andps   %xmm2, %xmm4
+
+       orps    %xmm4, %xmm3
+       orps    %xmm3, %xmm0
+
+       movq    %xmm0, (%ecx)
+       addl    $8, %ecx
+.L40:
+
+       /* Would it be worth having a specialized version of this loop for
+        * the case where the destination is 16-byte aligned?  That version
+        * would be identical except that it could use movdqa instead of
+        * movdqu.
+        */
+
+       /* %eax = number of 4-pixel groups.  The jump enters the loop at
+        * its test; jmp leaves the flags alone, so the first jne still
+        * sees the zero flag produced by shrl.
+        */
+
+       movl    %edx, %eax
+       shrl    $2, %eax
+       jmp     .L42
+.L43:
+       movdqa  (%ebx), %xmm0
+       addl    $16, %ebx
+
+       movdqa  %xmm0, %xmm3
+       movdqa  %xmm0, %xmm4
+       andps   %xmm1, %xmm0
+
+       andps   %xmm2, %xmm3
+       pslldq  $2, %xmm4
+       psrldq  $2, %xmm3
+       andps   %xmm2, %xmm4
+
+       orps    %xmm4, %xmm3
+       orps    %xmm3, %xmm0
+
+       movdqu  %xmm0, (%ecx)
+       addl    $16, %ecx
+       subl    $1, %eax
+.L42:
+       jne     .L43
+
+
+       /* There may be up to 3 pixels remaining to be copied.  Take care
+        * of them now.  We do the 2 pixel case first because the data
+        * will be aligned.
+        */
+
+       testl   $2, %edx
+       je      .L47
+
+       /* Both pointers must be advanced past this pair; otherwise a
+        * third remaining pixel would be read from, and written over, the
+        * first pixel of the pair.
+        */
+
+       movq    (%ebx), %xmm0
+       addl    $8, %ebx
+
+       movdqa  %xmm0, %xmm3
+       movdqa  %xmm0, %xmm4
+       andps   %xmm1, %xmm0
+
+       andps   %xmm2, %xmm3
+       pslldq  $2, %xmm4
+       psrldq  $2, %xmm3
+       andps   %xmm2, %xmm4
+
+       orps    %xmm4, %xmm3
+       orps    %xmm3, %xmm0
+
+       movq    %xmm0, (%ecx)
+       addl    $8, %ecx
+.L47:
+
+       testl   $1, %edx
+       je      .L46
+
+       DO_ONE_LAST_PIXEL()
+.L46:
+
+       popl    %ebx
+       popl    %esi
+       ret
+       .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
diff --git a/src/mesa/x86/read_rgba_span_x86.h b/src/mesa/x86/read_rgba_span_x86.h

new file mode 100644 (file)

index 0000000..99dd0e3
--- /dev/null
+++ b/src/mesa/x86/read_rgba_span_x86.h
@@ -0,0 +1,53 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+ 
+/**
+ * \file read_rgba_span_x86.h
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#ifndef READ_RGBA_SPAN_X86_H
+#define READ_RGBA_SPAN_X86_H
+
+#if defined(USE_SSE_ASM) || defined(USE_MMX_ASM)
+#include "x86/common_x86_asm.h"
+#endif
+
+/*
+ * The routines declared below are implemented in read_rgba_span_x86.S.
+ * Each one takes, in order: the source pointer, the destination pointer,
+ * and the number of pixels to copy, and converts BGRA8888_REV pixels to
+ * RGBA.
+ */
+
+/* SSE2 variant.
+ * NOTE(review): guarded by USE_SSE_ASM, the same symbol as the SSE variant
+ * -- confirm that no separate SSE2 build symbol is intended.
+ */
+#if defined(USE_SSE_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2( const unsigned char *,
+    unsigned char *, unsigned );
+#endif
+
+/* SSE variant.  Uses MMX registers internally; per the implementation's
+ * own warning, the caller is expected to issue EMMS at the correct places.
+ */
+#if defined(USE_SSE_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *,
+    unsigned char *, unsigned );
+#endif
+
+/* MMX variant.  Per the implementation's own warning, the caller is
+ * expected to issue EMMS at the correct places.
+ */
+#if defined(USE_MMX_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *,
+    unsigned char *, unsigned );
+#endif
+
+#endif /* READ_RGBA_SPAN_X86_H */
author	Ian Romanick <idr@us.ibm.com>
	Thu, 14 Oct 2004 00:59:12 +0000 (00:59 +0000)
committer	Ian Romanick <idr@us.ibm.com>
	Thu, 14 Oct 2004 00:59:12 +0000 (00:59 +0000)
src/mesa/drivers/dri/common/spantmp2.h	[new file with mode: 0644]	patch \| blob
src/mesa/drivers/dri/r128/r128_span.c		patch \| blob \| history
src/mesa/drivers/dri/r200/r200_span.c		patch \| blob \| history
src/mesa/drivers/dri/unichrome/via_span.c		patch \| blob \| history
src/mesa/sources		patch \| blob \| history
src/mesa/x86/read_rgba_span_x86.S	[new file with mode: 0644]	patch \| blob
src/mesa/x86/read_rgba_span_x86.h	[new file with mode: 0644]	patch \| blob