intel: Bump driver date to reflect status as final Q4 driver RC

[mesa.git] / src / mesa / x86 / read_rgba_span_x86.S
diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S

index e637f22da378dbc9543daa18f78462f8bd754ce7..92b1c2d902d6dae5259712054bdaae6a9bc2a0d6 100644 (file)
--- a/src/mesa/x86/read_rgba_span_x86.S
+++ b/src/mesa/x86/read_rgba_span_x86.S
@@ -31,22 +31,24 @@
   */
  
         .file   "read_rgba_span_x86.S"
-       .section        .rodata
-       .align 16
-       .type   mask, @object
-       .size   mask, 32
-mask:
-       .long   0xff00ff00
-       .long   0xff00ff00
-       .long   0xff00ff00
-       .long   0xff00ff00
-       .long   0x00ff0000
-       .long   0x00ff0000
-       .long   0x00ff0000
-       .long   0x00ff0000
-
-
-/* I implemented these as macros because the appear in quite a few places,
+#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
+/* Kevin F. Quinn 2nd July 2006: replaced data segment constants with
+ * text-segment instructions.  LOAD_MASK pushes each mask as immediates,
+ * loads it from (%esp) with mvins, then releases the 32 bytes it pushed. */
+#define        LOAD_MASK(mvins,m1,m2) \
+       pushl   $0xff00ff00 ;\
+       pushl   $0xff00ff00 ;\
+       pushl   $0xff00ff00 ;\
+       pushl   $0xff00ff00 ;\
+       mvins   (%esp), m1      ;\
+       pushl   $0x00ff0000 ;\
+       pushl   $0x00ff0000 ;\
+       pushl   $0x00ff0000 ;\
+       pushl   $0x00ff0000 ;\
+       mvins   (%esp), m2      ;\
+       addl    $32, %esp
+
+/* I implemented these as macros because they appear in several places,
   * and I've tweaked them a number of times.  I got tired of changing every
   * place they appear. :)
   */
@@ -75,6 +77,7 @@ mask:
   */
  
  .globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
         .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
  _generic_read_RGBA_span_BGRA8888_REV_MMX:
         pushl   %ebx
@@ -82,15 +85,14 @@ _generic_read_RGBA_span_BGRA8888_REV_MMX:
  #ifdef USE_INNER_EMMS
         emms
  #endif
-       movq    mask, %mm1
-       movq    mask+16, %mm2
+       LOAD_MASK(movq,%mm1,%mm2)
  
         movl    8(%esp), %ebx   /* source pointer */
         movl    16(%esp), %edx  /* number of pixels to copy */
         movl    12(%esp), %ecx  /* destination pointer */
  
         testl   %edx, %edx
-       je      .L20            /* Bail if there's nothing to do. */
+       jle     .L20            /* Bail if there's nothing to do. */
  
         movl    %ebx, %eax
  
@@ -170,6 +172,7 @@ _generic_read_RGBA_span_BGRA8888_REV_MMX:
   */
  
  .globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
         .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
  _generic_read_RGBA_span_BGRA8888_REV_SSE:
         pushl   %esi
@@ -179,13 +182,16 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE:
  #ifdef USE_INNER_EMMS
         emms
  #endif
-       movq    mask, %mm1
-       movq    mask+16, %mm2
+
+       LOAD_MASK(movq,%mm1,%mm2)
  
         movl    16(%esp), %ebx  /* source pointer */
         movl    24(%esp), %edx  /* number of pixels to copy */
         movl    20(%esp), %ecx  /* destination pointer */
  
+       testl   %edx, %edx
+       jle     .L35            /* Bail if there's nothing to do. */
+
         movl    %esp, %ebp
         subl    $16, %esp
         andl    $0xfffffff0, %esp
@@ -329,13 +335,13 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE:
  
         .text
  .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
         .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
  _generic_read_RGBA_span_BGRA8888_REV_SSE2:
         pushl   %esi
         pushl   %ebx
  
-       movdqa  mask, %xmm1
-       movdqa  mask+16, %xmm2
+       LOAD_MASK(movdqu,%xmm1,%xmm2)
  
         movl    12(%esp), %ebx  /* source pointer */
         movl    20(%esp), %edx  /* number of pixels to copy */
@@ -344,6 +350,9 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2:
         movl    %ebx, %eax
         movl    %edx, %esi
  
+       testl   %edx, %edx
+       jle     .L46            /* Bail if there's nothing to do. */
+
         /* If the source pointer isn't a multiple of 16 we have to process
          * a few pixels the "slow" way to get the address aligned for
          * the SSE fetch intsructions.
@@ -425,7 +434,8 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2:
         je      .L47
  
         movq    (%ebx), %xmm0
-
+       addl    $8, %ebx
+        
         movdqa  %xmm0, %xmm3
         movdqa  %xmm0, %xmm4
         andps   %xmm1, %xmm0
@@ -439,6 +449,7 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2:
         orps    %xmm3, %xmm0
  
         movq    %xmm0, (%ecx)
+       addl    $8, %ecx        
  .L47:
  
         testl   $1, %edx
@@ -451,3 +462,217 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2:
         popl    %esi
         ret
         .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+#define MASK_565_L     0x07e0f800      /* word 0 keeps bits 15-11, word 1 keeps bits 10-5 */
+#define MASK_565_H     0x0000001f      /* word 2 keeps bits 4-0, word 3 keeps nothing */
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the
+ * classic C implementation in Mesa.  Setting SCALE_ADJUST
+ * to 0 is slightly faster but at a small cost to accuracy.
+ */
+#define SCALE_ADJUST   5
+#if SCALE_ADJUST == 5
+#define PRESCALE_L 0x00100001
+#define PRESCALE_H 0x00000200
+#define SCALE_L 0x40C620E8
+#define SCALE_H 0x0000839d
+#elif SCALE_ADJUST == 0
+#define PRESCALE_L 0x00200001
+#define PRESCALE_H 0x00000800
+#define SCALE_L 0x01040108
+#define SCALE_H 0x00000108
+#else
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+#define ALPHA_L 0x00000000     /* nothing is ORed into words 0 and 1 */
+#define ALPHA_H 0x00ff0000     /* word 3 = 0x00ff: the constant alpha value */
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.
+ * Stack arguments: 4(%esp) source, 8(%esp) destination, 12(%esp) pixel count.
+ */
+       .text
+       .globl  _generic_read_RGBA_span_RGB565_MMX
+        .hidden _generic_read_RGBA_span_RGB565_MMX
+       .type   _generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+
+       movl    4(%esp), %eax   /* source pointer */
+       movl    8(%esp), %edx   /* destination pointer */
+       movl    12(%esp), %ecx  /* number of pixels to copy */
+
+       pushl   $MASK_565_H
+       pushl   $MASK_565_L
+       movq    (%esp), %mm5    /* %mm5 = per-word component masks */
+       pushl   $PRESCALE_H
+       pushl   $PRESCALE_L
+       movq    (%esp), %mm6    /* %mm6 = per-word pre-scale multipliers */
+       pushl   $SCALE_H
+       pushl   $SCALE_L
+       movq    (%esp), %mm7    /* %mm7 = per-word scale factors */
+       pushl   $ALPHA_H
+       pushl   $ALPHA_L
+       movq    (%esp), %mm3    /* %mm3 = alpha bits ORed into every pixel */
+       addl    $32,%esp        /* release the eight dwords pushed above */
+
+       sarl    $2, %ecx        /* %ecx = number of 4-pixel groups */
+       js      .L01            /* Bail early if the count is negative (SF is defined after sarl $2; OF is not). */
+       jmp     .L02            /* ZF from sarl decides whether the loop runs */
+
+.L03:
+       /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
+        * second pixels into the four words of %mm0 and %mm2.
+        */
+
+       movq    (%eax), %mm4
+       addl    $8, %eax
+
+       pshufw  $0x00, %mm4, %mm0       /* NOTE: pshufw/pmulhuw need SSE or extended MMX */
+       pshufw  $0x55, %mm4, %mm2
+
+
+       /* Mask the pixels so that each word of each register contains only
+        * one color component.
+        */
+
+       pand    %mm5, %mm0
+       pand    %mm5, %mm2
+
+
+       /* Adjust the component values so that they are as small as possible,
+        * but large enough so that we can multiply them by an unsigned 16-bit
+        * number and get a value as large as 0x00ff0000.
+        */
+
+       pmullw  %mm6, %mm0
+       pmullw  %mm6, %mm2
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+       psrlw   $SCALE_ADJUST, %mm2
+#endif
+
+       /* Scale the input component values to be on the range
+        * [0, 0x00ff0000].  This is the real magic of the whole routine.
+        */
+
+       pmulhuw %mm7, %mm0
+       pmulhuw %mm7, %mm2
+
+
+       /* Always set the alpha value to 0xff.
+        */
+
+       por %mm3, %mm0
+       por %mm3, %mm2
+
+
+       /* Pack the 16-bit values to 8-bit values and store the converted
+        * pixel data.
+        */
+
+       packuswb        %mm2, %mm0
+       movq    %mm0, (%edx)
+       addl    $8, %edx
+
+       pshufw  $0xaa, %mm4, %mm0       /* third pixel of the group */
+       pshufw  $0xff, %mm4, %mm2       /* fourth pixel of the group */
+
+       pand    %mm5, %mm0
+       pand    %mm5, %mm2
+       pmullw  %mm6, %mm0
+       pmullw  %mm6, %mm2
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+       psrlw   $SCALE_ADJUST, %mm2
+#endif
+       pmulhuw %mm7, %mm0
+       pmulhuw %mm7, %mm2
+
+       por %mm3, %mm0
+       por %mm3, %mm2
+
+       packuswb        %mm2, %mm0
+
+       movq    %mm0, (%edx)
+       addl    $8, %edx
+
+       subl    $1, %ecx        /* one 4-pixel group done; sets ZF for the jne below */
+.L02:
+       jne     .L03
+
+
+       /* At this point there can be at most 3 pixels left to process.  If
+        * there are 2 or 3 left, process 2.
+        */
+
+       movl    12(%esp), %ecx  /* reload the original pixel count */
+       testl   $0x02, %ecx
+       je      .L04
+
+       movd    (%eax), %mm4
+       addl    $4, %eax
+
+       pshufw  $0x00, %mm4, %mm0
+       pshufw  $0x55, %mm4, %mm2
+
+       pand    %mm5, %mm0
+       pand    %mm5, %mm2
+       pmullw  %mm6, %mm0
+       pmullw  %mm6, %mm2
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+       psrlw   $SCALE_ADJUST, %mm2
+#endif
+       pmulhuw %mm7, %mm0
+       pmulhuw %mm7, %mm2
+
+       por %mm3, %mm0
+       por %mm3, %mm2
+
+       packuswb        %mm2, %mm0
+
+       movq    %mm0, (%edx)
+       addl    $8, %edx
+
+.L04:
+       /* At this point there can be at most 1 pixel left to process.
+        * Process it if needed.
+        */
+
+       testl   $0x01, %ecx
+       je      .L01
+
+       movzwl  (%eax), %ecx    /* zero-extend the final 16-bit pixel */
+       movd    %ecx, %mm4
+
+       pshufw  $0x00, %mm4, %mm0
+
+       pand    %mm5, %mm0
+       pmullw  %mm6, %mm0
+#if SCALE_ADJUST > 0
+       psrlw   $SCALE_ADJUST, %mm0
+#endif
+       pmulhuw %mm7, %mm0
+
+       por %mm3, %mm0
+
+       packuswb        %mm0, %mm0
+
+       movd    %mm0, (%edx)    /* store the single converted pixel */
+
+.L01:
+#ifdef USE_INNER_EMMS
+       emms
+#endif
+       ret
+#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */
+       
+#if defined (__ELF__) && defined (__linux__)
+       .section .note.GNU-stack,"",%progbits
+#endif