Added MMX optimized version of the RGB565 ReadRGBASpan routine.

author Ian Romanick <idr@us.ibm.com>

Tue, 2 Nov 2004 18:25:45 +0000 (18:25 +0000)

committer Ian Romanick <idr@us.ibm.com>

Tue, 2 Nov 2004 18:25:45 +0000 (18:25 +0000)
author Ian Romanick <idr@us.ibm.com>
Tue, 2 Nov 2004 18:25:45 +0000 (18:25 +0000)
committer Ian Romanick <idr@us.ibm.com>
Tue, 2 Nov 2004 18:25:45 +0000 (18:25 +0000)
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h

index 5a161b11ddf22ad0346ddcc254fbd90bb6405dc6..ce48257836dffff088d6954173db2e5d4e41234e 100644 (file)
--- a/src/mesa/drivers/dri/common/spantmp2.h
+++ b/src/mesa/drivers/dri/common/spantmp2.h
@@ -377,8 +377,10 @@ static void TAG(ReadRGBASpan)( const GLcontext *ctx,
  
  
  #if defined(USE_MMX_ASM) && \
-   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
-     (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
+   (((SPANTMP_PIXEL_FMT == GL_BGRA) && \
+       (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)) || \
+    ((SPANTMP_PIXEL_FMT == GL_RGB) && \
+       (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)))
  static void TAG2(ReadRGBASpan,_MMX)( const GLcontext *ctx,
                                GLuint n, GLint x, GLint y,
                                GLubyte rgba[][4])
@@ -406,7 +408,12 @@ static void TAG2(ReadRGBASpan,_MMX)( const GLcontext *ctx,
  
                {
                   const char * src = GET_SRC_PTR( x1, y );
+#if (SPANTMP_PIXEL_FMT == GL_RGB) && \
+                 (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)
+                 _generic_read_RGBA_span_RGB565_MMX( src, rgba[i], n1 );
+#else
                   _generic_read_RGBA_span_BGRA8888_REV_MMX( src, rgba[i], n1 );
+#endif
                }
           }
         HW_ENDCLIPLOOP();
@@ -539,29 +546,34 @@ static void TAG(InitPointers)(struct swrast_device_driver *swdd)
     swdd->WriteMonoRGBAPixels = TAG(WriteMonoRGBAPixels);
     swdd->ReadRGBAPixels = TAG(ReadRGBAPixels);
  
-#if (SPANTMP_PIXEL_FMT == GL_BGRA) && \
+#if defined(USE_SSE_ASM) && \
+   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
       (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
-#if defined(USE_SSE_ASM)
     if ( cpu_has_xmm2 ) {
        if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE2" );
        swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE2);
     }
     else
  #endif
-#if defined(USE_SSE_ASM)
+#if defined(USE_SSE_ASM) && \
+   (SPANTMP_PIXEL_FMT == GL_BGRA) && \
+     (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
     if ( cpu_has_xmm ) {
        if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE" );
        swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE);
     }
     else
  #endif
-#if defined(USE_MMX_ASM)
+#if defined(USE_MMX_ASM) && \
+   (((SPANTMP_PIXEL_FMT == GL_BGRA) && \
+       (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)) || \
+    ((SPANTMP_PIXEL_FMT == GL_RGB) && \
+       (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5)))
     if ( cpu_has_mmx ) {
        if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "MMX" );
        swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _MMX);
     }
     else
-#endif
  #endif
     {
        if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "C" );
diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S

index e637f22da378dbc9543daa18f78462f8bd754ce7..06bdc6d264ed61000527331081aabf8a5da4289f 100644 (file)
--- a/src/mesa/x86/read_rgba_span_x86.S
+++ b/src/mesa/x86/read_rgba_span_x86.S
@@ -451,3 +451,226 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2:
         popl    %esi
         ret
         .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+       .section        .rodata
+
+       .align  16
+mask_565:
+       .word   0xf800          /* red:   top five bits of the pixel */
+       .word   0x07e0          /* green: middle six bits */
+       .word   0x001f          /* blue:  low five bits */
+       .word   0x0000          /* cleared; alpha is OR-ed in later */
+
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
+ * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
+ * at a small cost to accuracy.  The routine multiplies each component by
+ * prescale, shifts right by SCALE_ADJUST, then takes pmulhuw with scale.
+ */
+#define SCALE_ADJUST   5
+#if SCALE_ADJUST == 5
+prescale:
+       .word   0x0001
+       .word   0x0010
+       .word   0x0200
+       .word   0x0000
+
+scale:
+       .word   0x20e8          /* (0x00ff0000 / 0x000007c0) + 1 */
+       .word   0x40c5          /* (0x00ff0000 / 0x000003f0) + 2 */
+       .word   0x839d          /* (0x00ff0000 / 0x000001f0) + 1 */
+       .word   0x0000
+#elif SCALE_ADJUST == 0
+prescale:
+       .word   0x0001
+       .word   0x0020
+       .word   0x0800
+       .word   0x0000
+
+scale:
+       .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
+       .word   0x0104          /* (0x00ff0000 / 0x0000fc00) + 1 */
+       .word   0x0108          /* (0x00ff0000 / 0x0000f800) + 1 */
+       .word   0x0000
+#else
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+
+
+alpha: .long   0x00000000      /* words 0-2 (red, green, blue) untouched */
+       .long   0x00ff0000      /* word 3 = 0x00ff: alpha forced to 0xff */
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.
+ * Stack arguments: 4(%esp) source, 8(%esp) destination, 12(%esp) pixel count.
+ */
+       .text
+       .globl  _generic_read_RGBA_span_RGB565_MMX
+       .type   _generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+       /* NOTE(review): pshufw/pmulhuw need SSE or extended MMX -- confirm CPU gating. */
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+
+       movl    4(%esp), %eax   /* source pointer */
+       movl    8(%esp), %edx   /* destination pointer */
+       movl    12(%esp), %ecx  /* number of pixels to copy */
+       /* Load the per-word mask and scale factors once for the whole span. */
+       movq    mask_565, %mm5
+       movq    prescale, %mm6
+       movq    scale, %mm7
+       /* shrl sets ZF when no complete 4-pixel group exists; jne at .L02 tests it. */
+       shrl    $2, %ecx
+       jmp     .L02
+
+.L03:
+       /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
+        * second pixels into the four words of %mm0 and %mm2.
+        */
+
+       movq    (%eax), %mm4
+       addl    $8, %eax
+
+       pshufw  $0x00, %mm4, %mm0
+       pshufw  $0x55, %mm4, %mm2
+
+
+       /* Mask the pixels so that each word of each register contains only
+        * one color component.
+        */
+
+       pand    %mm5, %mm0
+       pand    %mm5, %mm2
+
+
+       /* Adjust the component values so that they are as small as possible,
+        * but large enough so that we can multiply them by an unsigned 16-bit
+        * number and get a value as large as 0x00ff0000.
+        */
+
+       pmullw  %mm6, %mm0
+       pmullw  %mm6, %mm2
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+       psrlw   $SCALE_ADJUST, %mm2
+#endif
+
+       /* Scale the input component values to be on the range
+        * [0, 0x00ff0000].  This is the real magic of the whole routine.
+        */
+
+       pmulhuw %mm7, %mm0
+       pmulhuw %mm7, %mm2
+
+
+       /* Always set the alpha value to 0xff.
+        */
+
+       por     alpha, %mm0
+       por     alpha, %mm2
+
+
+       /* Pack the 16-bit values to 8-bit values and store the converted
+        * pixel data.
+        */
+
+       packuswb        %mm2, %mm0
+       movq    %mm0, (%edx)
+       addl    $8, %edx
+
+       /* Repeat the same conversion for the third and fourth pixels of %mm4. */
+
+       pshufw  $0xaa, %mm4, %mm0
+       pshufw  $0xff, %mm4, %mm2
+
+       pand    %mm5, %mm0
+       pand    %mm5, %mm2
+       pmullw  %mm6, %mm0
+       pmullw  %mm6, %mm2
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+       psrlw   $SCALE_ADJUST, %mm2
+#endif
+       pmulhuw %mm7, %mm0
+       pmulhuw %mm7, %mm2
+
+       por     alpha, %mm0
+       por     alpha, %mm2
+
+       packuswb        %mm2, %mm0
+
+       movq    %mm0, (%edx)
+       addl    $8, %edx
+
+       subl    $1, %ecx
+.L02:
+       jne     .L03
+
+       /* At this point there can be at most 3 pixels left to process.  If
+        * there are either 2 or 3 left, process 2.  The count is reloaded
+        * because %ecx was consumed as the loop counter.
+        */
+
+       movl    12(%esp), %ecx
+       testl   $0x02, %ecx
+       je      .L04
+
+       movd    (%eax), %mm4
+       addl    $4, %eax
+
+       pshufw  $0x00, %mm4, %mm0
+       pshufw  $0x55, %mm4, %mm2
+
+       pand    %mm5, %mm0
+       pand    %mm5, %mm2
+       pmullw  %mm6, %mm0
+       pmullw  %mm6, %mm2
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+       psrlw   $SCALE_ADJUST, %mm2
+#endif
+       pmulhuw %mm7, %mm0
+       pmulhuw %mm7, %mm2
+
+       por     alpha, %mm0
+       por     alpha, %mm2
+
+       packuswb        %mm2, %mm0
+
+       movq    %mm0, (%edx)
+       addl    $8, %edx
+
+.L04:
+       /* At this point there can be at most 1 pixel left to process.
+        * Process it if needed.
+        */
+
+       testl   $0x01, %ecx
+       je      .L01
+       /* %ecx is overwritten with the pixel; the count is not read again. */
+       movzxw  (%eax), %ecx
+       movd    %ecx, %mm4
+
+       pshufw  $0x00, %mm4, %mm0
+
+       pand    %mm5, %mm0
+       pmullw  %mm6, %mm0
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+#endif
+       pmulhuw %mm7, %mm0
+
+       por     alpha, %mm0
+
+       packuswb        %mm0, %mm0
+       /* Store only the low four bytes: one RGBA pixel. */
+       movd    %mm0, (%edx)
+
+.L01:
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+       ret
diff --git a/src/mesa/x86/read_rgba_span_x86.h b/src/mesa/x86/read_rgba_span_x86.h

index 99dd0e365d0b4d881abe28d816b05c9e5b53a4f1..564b1bb0f9e0671d478e94dbd4770282a7a6289d 100644 (file)
--- a/src/mesa/x86/read_rgba_span_x86.h
+++ b/src/mesa/x86/read_rgba_span_x86.h
@@ -48,6 +48,9 @@ extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *,
  #if defined(USE_MMX_ASM)
  extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *,
      unsigned char *, unsigned );
+
+extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *,
+    unsigned char *, unsigned );
  #endif
  
  #endif /* READ_RGBA_SPAN_X86_H */
author	Ian Romanick <idr@us.ibm.com>
	Tue, 2 Nov 2004 18:25:45 +0000 (18:25 +0000)
committer	Ian Romanick <idr@us.ibm.com>
	Tue, 2 Nov 2004 18:25:45 +0000 (18:25 +0000)
src/mesa/drivers/dri/common/spantmp2.h		patch \| blob \| history
src/mesa/x86/read_rgba_span_x86.S		patch \| blob \| history
src/mesa/x86/read_rgba_span_x86.h		patch \| blob \| history