From 55d9ee83b4c29e8f7c373ee6326bbb4f77402bee Mon Sep 17 00:00:00 2001
From: Jose Fonseca
Date: Thu, 18 Apr 2002 11:57:28 +0000
Subject: [PATCH] Definition of several utility macros for self-contained MMX
 operations such as scaling and lerping.

Restructured the MMX blending function to use a template, so that only
the main loop needs to be specified; the same loop is also used to
generate the runin and runout sections.

Optimized the MMX function after remembering that multiplication is
commutative (how can somebody forget this..), resulting in lower
register usage. There is now no need to generate or read any constant
from memory inside the loop.

Assemblers other than the GNU assembler may choke on the output of the
C preprocessor, since it was necessary to add ';' line separators to
the defined macros.
---
 src/mesa/x86/mmx_blend.S    | 531 +++++++++++++++---------------------
 src/mesa/x86/mmx_blendtmp.h | 113 ++++++++
 2 files changed, 339 insertions(+), 305 deletions(-)
 create mode 100644 src/mesa/x86/mmx_blendtmp.h

diff --git a/src/mesa/x86/mmx_blend.S b/src/mesa/x86/mmx_blend.S
index e679aa7bc79..f80cbf6c45e 100644
--- a/src/mesa/x86/mmx_blend.S
+++ b/src/mesa/x86/mmx_blend.S
@@ -4,8 +4,10 @@
 #include "matypes.h"
 
-/*
- * make the following approximation to the division (Sree)
+
+/* integer multiplication - alpha plus one
+ *
+ * makes the following approximation to the division (Sree)
  *
  *   rgb*a/255 ~= (rgb*(a+1)) >> 8
  *
  * which is the fastest method that satisfies the following OpenGL criteria
  *
  *   0*0 = 0 and 255*255 = 255
  *
- * note this one should be used alone
+ * note that MX1 is a register with the 0xffffffffffffffff constant, which
+ * can easily be obtained by doing
+ *
+ *   PCMPEQW ( MX1, MX1 )
  */
-#define GMBT_ALPHA_PLUS_ONE    0
-
-/*
- * take the geometric series approximation to the division
+#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
+    PSUBW ( MX1, MA1 )          /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\
+TWO(PSUBW ( MX1, MA2 ))         /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\
+ ;\
+    PMULLW ( MP1, MA1 )         /* t1 = p1*a1 */ ;\
+TWO(PMULLW ( MP2, MA2 ))        /* t2 = p2*a2 */ ;\
+ ;\
+    PSRLW ( CONST(8), MA1 )     /* t1 >> 8 ~= t1/255 */ ;\
+TWO(PSRLW ( CONST(8), MA2 ))    /* t2 >> 8 ~= t2/255 */
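For reference, the alpha-plus-one approximation can be checked exhaustively
in scalar C. The sketch below is illustrative only and is not part of the
patch (the helper name mult_ap1 is made up); it models the word arithmetic
of GMB_MULT_AP1:

    #include <assert.h>
    #include <stdio.h>

    /* scalar model of GMB_MULT_AP1: rgb*a/255 ~= (rgb*(a+1)) >> 8 */
    static unsigned mult_ap1(unsigned rgb, unsigned a)
    {
        return (rgb * (a + 1)) >> 8;
    }

    int main(void)
    {
        unsigned rgb, a;

        for (rgb = 0; rgb <= 255; rgb++)
            for (a = 0; a <= 255; a++) {
                unsigned exact = rgb * a / 255;
                unsigned approx = mult_ap1(rgb, a);
                /* never below the exact quotient, and at most one above */
                assert(approx >= exact && approx - exact <= 1);
            }

        /* the OpenGL endpoint criteria hold exactly */
        assert(mult_ap1(0, 0) == 0 && mult_ap1(255, 255) == 255);
        puts("GMB_MULT_AP1: within one of rgb*a/255, endpoints exact");
        return 0;
    }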
@@ -29,333 +43,240 @@
 
+/* integer multiplication - geometric series
+ *
+ * takes the geometric series approximation to the division
+ *
+ *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
+ *
+ * in this case just the first two terms, to fit in 16bit arithmetic
+ *
+ *   t/255 ~= (t + (t >> 8)) >> 8
+ *
+ * note that just by itself it doesn't satisfy the OpenGL criteria, as
+ * 255*255 = 254, so either the special case a = 255 must be accounted
+ * for or roundoff must be used
+ */
-#define GMBT_GEOMETRIC_SERIES    1
-
-/*
+#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
+    PMULLW ( MP1, MA1 )         /* t1 = p1*a1 */ ;\
+TWO(PMULLW ( MP2, MA2 ))        /* t2 = p2*a2 */ ;\
+ ;\
+    MOVQ ( MA1, MP1 ) ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+ ;\
+    PSRLW ( CONST(8), MP1 )     /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 ))    /* t2 >> 8 */ ;\
+ ;\
+    PADDW ( MP1, MA1 )          /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 ))         /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+    PSRLW ( CONST(8), MA1 )     /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 ))    /* sa2 | sb2 | sg2 | sr2 */
+
+
+/* integer multiplication - geometric series plus rounding
+ *
  * when using a geometric series division instead of truncating the result
  * use roundoff in the approximation (Jim Blinn)
  *
  *   t = rgb*a + 0x80
  *
  * achieving exact results
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
  */
-#define GMBT_ROUNDOFF    0
-
-/* instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
+#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
+    PMULLW ( MP1, MA1 )         /* t1 = p1*a1 */ ;\
+TWO(PMULLW ( MP2, MA2 ))        /* t2 = p2*a2 */ ;\
+ ;\
+    PADDW ( M80, MA1 )          /* t1 += 0x80 */ ;\
+TWO(PADDW ( M80, MA2 ))         /* t2 += 0x80 */ ;\
+ ;\
+    MOVQ ( MA1, MP1 ) ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+ ;\
+    PSRLW ( CONST(8), MP1 )     /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 ))    /* t2 >> 8 */ ;\
+ ;\
+    PADDW ( MP1, MA1 )          /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 ))         /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+    PSRLW ( CONST(8), MA1 )     /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 ))    /* sa2 | sb2 | sg2 | sr2 */
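Blinn's roundoff can be verified the same way: with t = p*a + 0x80, the
two-term series (t + (t >> 8)) >> 8 equals the correctly rounded p*a/255
for all 256x256 input pairs. A minimal scalar sketch, again with made-up
names and not part of the patch:

    #include <assert.h>
    #include <stdio.h>

    /* scalar model of GMB_MULT_GSR: Blinn's roundoff plus the
     * two-term geometric series division */
    static unsigned mult_gsr(unsigned p, unsigned a)
    {
        unsigned t = p * a + 0x80;
        return (t + (t >> 8)) >> 8;
    }

    int main(void)
    {
        unsigned p, a;

        for (p = 0; p <= 255; p++)
            for (a = 0; a <= 255; a++)
                /* (p*a + 127) / 255 is p*a/255 rounded to nearest */
                assert(mult_gsr(p, a) == (p * a + 127) / 255);

        puts("GMB_MULT_GSR: exact rounded division by 255");
        return 0;
    }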
+
+
+/* linear interpolation - geometric series
+ */
+#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2 ) \
+    PSUBW ( MQ1, MP1 )          /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+TWO(PSUBW ( MQ2, MP2 ))         /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+ ;\
+    PSLLW ( CONST(8), MQ1 )     /* q1 << 8 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 ))    /* q2 << 8 */ ;\
+ ;\
+    PMULLW ( MP1, MA1 )         /* t1 = (p1 - q1)*pa1 */ ;\
+TWO(PMULLW ( MP2, MA2 ))        /* t2 = (p2 - q2)*pa2 */ ;\
+ ;\
+    MOVQ ( MA1, MP1 ) ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+ ;\
+    PSRLW ( CONST(8), MP1 )     /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 ))    /* t2 >> 8 */ ;\
+ ;\
+    PADDW ( MP1, MA1 )          /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 ))         /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+    PADDW ( MQ1, MA1 )          /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 ))         /* (t2/255 + q2) << 8 */ ;\
+ ;\
+    PSRLW ( CONST(8), MA1 )     /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 ))    /* sa2 | sb2 | sg2 | sr2 */
+
+
+/* linear interpolation - geometric series with roundoff
+ *
+ * this is a generalization of Blinn's formula to signed arithmetic
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
+ */
+#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80 ) \
+    PSUBW ( MQ1, MP1 )          /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+TWO(PSUBW ( MQ2, MP2 ))         /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+ ;\
+    PSLLW ( CONST(8), MQ1 )     /* q1 << 8 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 ))    /* q2 << 8 */ ;\
+ ;\
+    PMULLW ( MP1, MA1 )         /* t1 = (p1 - q1)*pa1 */ ;\
+TWO(PMULLW ( MP2, MA2 ))        /* t2 = (p2 - q2)*pa2 */ ;\
+ ;\
+    PSRLW ( CONST(15), MP1 )    /* q1 > p1 ? 1 : 0 */ ;\
+TWO(PSRLW ( CONST(15), MP2 ))   /* q2 > p2 ? 1 : 0 */ ;\
+ ;\
+    PSLLW ( CONST(8), MP1 )     /* q1 > p1 ? 0x100 : 0 */ ;\
+TWO(PSLLW ( CONST(8), MP2 ))    /* q2 > p2 ? 0x100 : 0 */ ;\
+ ;\
+    PSUBW ( MP1, MA1 )          /* t1 -=? 0x100 */ ;\
+TWO(PSUBW ( MP2, MA2 ))         /* t2 -=? 0x100 */ ;\
+ ;\
+    PADDW ( M80, MA1 )          /* t1 += 0x80 */ ;\
+TWO(PADDW ( M80, MA2 ))         /* t2 += 0x80 */ ;\
+ ;\
+    MOVQ ( MA1, MP1 ) ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+ ;\
+    PSRLW ( CONST(8), MP1 )     /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 ))    /* t2 >> 8 */ ;\
+ ;\
+    PADDW ( MP1, MA1 )          /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 ))         /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+    PADDW ( MQ1, MA1 )          /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 ))         /* (t2/255 + q2) << 8 */ ;\
+ ;\
+    PSRLW ( CONST(8), MA1 )     /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 ))    /* sa2 | sb2 | sg2 | sr2 */
+
+
+/* linear interpolation - geometric series with correction
+ *
+ * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
+ *
+ *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
+ *
+ * note that although it is faster than rounding off, it doesn't always give the exact results
+ */
-#define GMBT_GEOMETRIC_CORRECTION    1
-
-#if GMBT_ROUNDOFF
+#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2 ) \
+    PSUBW ( MQ1, MP1 )          /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+TWO(PSUBW ( MQ2, MP2 ))         /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+ ;\
+    PSLLW ( CONST(8), MQ1 )     /* q1 << 8 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 ))    /* q2 << 8 */ ;\
+ ;\
+    PMULLW ( MP1, MA1 )         /* t1 = (p1 - q1)*pa1 */ ;\
+TWO(PMULLW ( MP2, MA2 ))        /* t2 = (p2 - q2)*pa2 */ ;\
+ ;\
+    MOVQ ( MA1, MP1 ) ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+ ;\
+    PSRLW ( CONST(8), MP1 )     /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 ))    /* t2 >> 8 */ ;\
+ ;\
+    PADDW ( MP1, MA1 )          /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 ))         /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+    PSRLW ( CONST(7), MP1 )     /* t1 >> 15 */ ;\
+TWO(PSRLW ( CONST(7), MP2 ))    /* t2 >> 15 */ ;\
+ ;\
+    PADDW ( MP1, MA1 )          /* t1 + (t1 >> 8) + (t1 >> 15) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 ))         /* t2 + (t2 >> 8) + (t2 >> 15) ~= (t2/255) << 8 */ ;\
+ ;\
+    PADDW ( MQ1, MA1 )          /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 ))         /* (t2/255 + q2) << 8 */ ;\
+ ;\
+    PSRLW ( CONST(8), MA1 )     /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 ))    /* sa2 | sb2 | sg2 | sr2 */
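The lerp macros keep every step in 16bit words with logical shifts, so a
faithful scalar model has to emulate the PMULLW low-word wraparound. The
sketch below (hypothetical helper name, not part of the patch) mirrors
GMB_LERP_GSC and checks that the result stays within one of the exact
q + (p - q)*a/255:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* scalar model of GMB_LERP_GSC on one channel; all intermediate
     * values are 16-bit words with logical shifts, like the MMX code */
    static unsigned lerp_gsc(unsigned p, unsigned q, unsigned a)
    {
        uint16_t t = (uint16_t)(((int)p - (int)q) * (int)a); /* PMULLW low word */
        uint16_t s = (uint16_t)(t + (t >> 8) + (t >> 15));   /* ~(t/255) << 8 */
        s = (uint16_t)(s + (q << 8));                        /* + q << 8 */
        return s >> 8;                                       /* final PSRLW */
    }

    int main(void)
    {
        unsigned p, q, a;

        for (p = 0; p <= 255; p++)
            for (q = 0; q <= 255; q++)
                for (a = 0; a <= 255; a++) {
                    int exact = (int)q + ((int)p - (int)q) * (int)a / 255;
                    int got = (int)lerp_gsc(p, q, a);
                    assert(got - exact <= 1 && exact - got <= 1);
                }

        puts("GMB_LERP_GSC stays within one of the exact lerp");
        return 0;
    }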
+
+
+/* common blending initialization code
+ */
+#if 0 /* rounding not used */
+
     SEG_DATA
 
 ALIGNDATA8
 const_80:
     D_LONG 0x00800080, 0x00800080
 
-#endif
-
-    SEG_TEXT
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_mmx_blend_transparency)
-
-/*
- * void blend_transparency( GLcontext *ctx,
- *                          GLuint n,
- *                          const GLubyte mask[],
- *                          GLchan rgba[][4],
- *                          CONST GLchan dest[][4] )
- *
- * Common transparency blending mode.
- */
-GLNAME( _mesa_mmx_blend_transparency ):
-
-    PUSH_L ( EBP )
-    MOV_L ( ESP, EBP )
-    PUSH_L ( ESI )
-    PUSH_L ( EDI )
-    PUSH_L ( EBX )
-
-    MOV_L ( REGOFF(12, EBP), ECX )  /* n */
-    CMP_L ( CONST(0), ECX )
-    JE ( LLBL (GMBT_return) )
-
-    MOV_L ( REGOFF(16, EBP), EBX )  /* mask */
-    MOV_L ( REGOFF(20, EBP), EDI )  /* rgba */
-    MOV_L ( REGOFF(24, EBP), ESI )  /* dest */
-
-    TEST_L ( CONST(4), EDI )        /* align rgba on an 8-byte boundary */
-    JZ ( LLBL (GMBT_align_end) )
-
-    CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
-    JE ( LLBL (GMBT_align_continue) )
-
-    PXOR ( MM0, MM0 )               /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
-
-    MOVD ( REGIND(ESI), MM1 )       /* | | | | qa1 | qb1 | qg1 | qr1 */
-    MOVD ( REGIND(EDI), MM2 )       /* | | | | pa1 | pb1 | pg1 | pr1 */
-
-    PUNPCKLBW ( MM0, MM1 )          /* qa1 | qb1 | qg1 | qr1 */
-    PUNPCKLBW ( MM0, MM2 )          /* pa1 | pb1 | pg1 | pr1 */
-
-    MOVQ ( MM2, MM3 )
-
-    PUNPCKHWD ( MM3, MM3 )          /* pa1 | pa1 | | */
-    PUNPCKHDQ ( MM3, MM3 )          /* pa1 | pa1 | pa1 | pa1 */
-
-#if GMBT_ALPHA_PLUS_ONE
-    PCMPEQW ( MM4, MM4 )            /* 0xffff | 0xffff | 0xffff | 0xffff */
-
-    PSUBW ( MM4, MM3 )              /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
-#endif
-
-    PSUBW ( MM1, MM2 )              /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
-
-    PSLLW ( CONST(8), MM1 )         /* q1 << 8 */
-
-#if GMBT_ROUNDOFF
-    MOVQ ( MM2, MM4 )
-#endif
-
-    PMULLW ( MM3, MM2 )             /* t1 = (q1 - p1)*pa1 */
-
-#if GMBT_ROUNDOFF
-    PSRLW ( CONST(15), MM4 )        /* q1 > p1 ? 1 : 0 */
-
-    PSLLW ( CONST(8), MM4 )         /* q1 > p1 ? 0x100 : 0 */
-
-    PSUBW ( MM4, MM2 )              /* t1 -=? 0x100 */
-#endif
-
-#if GMBT_ROUNDOFF
-    MOVQ ( CONTENT(const_80), MM4 )
-
-    PADDW ( MM4, MM2 )              /* t1 += 0x80 */
-#endif
-
-#if GMBT_GEOMETRIC_SERIES
-    MOVQ ( MM2, MM3 )
-
-    PSRLW ( CONST(8), MM3 )         /* t1 >> 8 */
-
-    PADDW ( MM3, MM2 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
-
-#if GMBT_GEOMETRIC_CORRECTION
-    PSRLW ( CONST(7), MM3 )         /* t1 >> 15 */
-
-    PADDW ( MM3, MM2 )              /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
-#endif
-#endif
-
-    PADDW ( MM1, MM2 )              /* (t1/255 + q1) << 8 */
-
-    PSRLW ( CONST(8), MM2 )         /* sa1 | sb1 | sg1 | sr1 */
-
-    PACKUSWB ( MM0, MM2 )           /* | | | | sa1 | sb1 | sg1 | sr1 */
-    MOVD ( MM2, REGIND(EDI) )
-
-LLBL (GMBT_align_continue):
-
-    DEC_L ( ECX )                   /* n -= 1 */
-    INC_L ( EBX )                   /* mask += 1 */
-    ADD_L ( CONST(4), EDI )         /* rgba += 1 */
-    ADD_L ( CONST(4), ESI )         /* dest += 1 */
-
-LLBL (GMBT_align_end):
-
-    CMP_L ( CONST(2), ECX )
-    JB ( LLBL (GMBT_loop_end) )
-ALIGNTEXT16
-LLBL (GMBT_loop_begin):
+#define GMB_INIT( M00, M80 ) \
+    PXOR ( M00, M00 )               /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ ;\
+    MOVQ ( CONTENT(const_80), M80 ) /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */
 
-    CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
-    JE ( LLBL (GMBT_loop_continue) )
+#else
 
-    /* NOTE: the instruction pairing when multiple pipelines are available must be checked */
+#define GMB_INIT( M00 ) \
+    PXOR ( M00, M00 )               /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
 
-    PXOR ( MM0, MM0 )               /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
-
-    MOVQ ( REGIND(ESI), MM7 )       /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
-    MOVQ ( REGIND(EDI), MM6 )       /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */
-
-    MOVQ ( MM7, MM1 )
-    MOVQ ( MM6, MM2 )
-
-    PUNPCKLBW ( MM0, MM1 )          /* qa1 | qb1 | qg1 | qr1 */
-    PUNPCKHBW ( MM0, MM7 )          /* qa2 | qb2 | qg2 | qr2 */
-    PUNPCKLBW ( MM0, MM2 )          /* pa1 | pb1 | pg1 | pr1 */
-    PUNPCKHBW ( MM0, MM6 )          /* pa2 | pb2 | pg2 | pr2 */
-
-    MOVQ ( MM2, MM3 )
-    MOVQ ( MM6, MM5 )
-
-    PUNPCKHWD ( MM3, MM3 )          /* pa1 | pa1 | | */
-    PUNPCKHWD ( MM5, MM5 )          /* pa2 | pa2 | | */
-    PUNPCKHDQ ( MM3, MM3 )          /* pa1 | pa1 | pa1 | pa1 */
-    PUNPCKHDQ ( MM5, MM5 )          /* pa2 | pa2 | pa2 | pa2 */
-
-#if GMBT_ALPHA_PLUS_ONE
-    PCMPEQW ( MM4, MM4 )            /* 0xffff | 0xffff | 0xffff | 0xffff */
-
-    PSUBW ( MM4, MM3 )              /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
-    PSUBW ( MM4, MM5 )              /* pa2 + 1 | pa2 + 1 | pa2 + 1 | pa2 + 1 */
-#endif
-
-    PSUBW ( MM1, MM2 )              /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
-    PSUBW ( MM7, MM6 )              /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */
-
-    PSLLW ( CONST(8), MM1 )         /* q1 << 8 */
-    PSLLW ( CONST(8), MM7 )         /* q2 << 8 */
-
-#if GMBT_ROUNDOFF
-    MOVQ ( MM2, MM0 )
-    MOVQ ( MM6, MM4 )
-#endif
-
-    PMULLW ( MM3, MM2 )             /* t1 = (q1 - p1)*pa1 */
-    PMULLW ( MM5, MM6 )             /* t2 = (q2 - p2)*pa2 */
-
-#if GMBT_ROUNDOFF
-    PSRLW ( CONST(15), MM0 )        /* q1 > p1 ? 1 : 0 */
-    PSRLW ( CONST(15), MM4 )        /* q2 > q2 ? 1 : 0 */
-
-    PSLLW ( CONST(8), MM0 )         /* q1 > p1 ? 0x100 : 0 */
-    PSLLW ( CONST(8), MM4 )         /* q2 > q2 ? 0x100 : 0 */
-
-    PSUBW ( MM0, MM2 )              /* t1 -=? 0x100 */
-    PSUBW ( MM4, MM7 )              /* t2 -=? 0x100 */
-#endif
-
-#if GMBT_ROUNDOFF
-    MOVQ ( CONTENT(const_80), MM4 )
-
-    PADDW ( MM4, MM2 )              /* t1 += 0x80 */
-    PADDW ( MM4, MM6 )              /* t2 += 0x80 */
-#endif
-
-#if GMBT_GEOMETRIC_SERIES
-    MOVQ ( MM2, MM3 )
-    MOVQ ( MM6, MM5 )
-
-    PSRLW ( CONST(8), MM3 )         /* t1 >> 8 */
-    PSRLW ( CONST(8), MM5 )         /* t2 >> 8 */
-
-    PADDW ( MM3, MM2 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
-    PADDW ( MM5, MM6 )              /* t2 + (t2 >> 8) ~= (t2/255) << 8 */
-
-#if GMBT_GEOMETRIC_CORRECTION
-    PSRLW ( CONST(7), MM3 )         /* t1 >> 15 */
-    PSRLW ( CONST(7), MM5 )         /* t2 >> 15 */
-
-    PADDW ( MM3, MM2 )              /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
-    PADDW ( MM5, MM6 )              /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */
-#endif
-#endif
-
-    PADDW ( MM1, MM2 )              /* (t1/255 + q1) << 8 */
-    PADDW ( MM7, MM6 )              /* (t2/255 + q2) << 8 */
-
-    PSRLW ( CONST(8), MM2 )         /* sa1 | sb1 | sg1 | sr1 */
-    PSRLW ( CONST(8), MM6 )         /* sa2 | sb2 | sg2 | sr2 */
-
-    PACKUSWB ( MM6, MM2 )           /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
-    MOVQ ( MM2, REGIND(EDI) )
-
-LLBL (GMBT_loop_continue):
-
-    DEC_L ( ECX )
-    DEC_L ( ECX )                   /* n -= 2 */
-    ADD_L ( CONST(2), EBX )         /* mask += 2 */
-    ADD_L ( CONST(8), EDI )         /* rgba += 2 */
-    ADD_L ( CONST(8), ESI )         /* dest += 2 */
-    CMP_L ( CONST(2), ECX )
-    JAE ( LLBL (GMBT_loop_begin) )
-
-LLBL (GMBT_loop_end):
-
-    CMP_L ( CONST(1), ECX )
-    JB ( LLBL (GMBT_done) )
-
-    CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
-    JE ( LLBL (GMBT_done) )
-
-    PXOR ( MM0, MM0 )               /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
-
-    MOVD ( REGIND(ESI), MM1 )       /* | | | | qa1 | qb1 | qg1 | qr1 */
-    MOVD ( REGIND(EDI), MM2 )       /* | | | | pa1 | pb1 | pg1 | pr1 */
-
-    PUNPCKLBW ( MM0, MM1 )          /* qa1 | qb1 | qg1 | qr1 */
-    PUNPCKLBW ( MM0, MM2 )          /* pa1 | pb1 | pg1 | pr1 */
-
-    MOVQ ( MM2, MM3 )
-
-    PUNPCKHWD ( MM3, MM3 )          /* pa1 | pa1 | | */
-    PUNPCKHDQ ( MM3, MM3 )          /* pa1 | pa1 | pa1 | pa1 */
-
-#if GMBT_ALPHA_PLUS_ONE
-    PCMPEQW ( MM4, MM4 )            /* 0xffff | 0xffff | 0xffff | 0xffff */
-
-    PSUBW ( MM4, MM3 )              /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
-#endif
-
-    PSUBW ( MM1, MM2 )              /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
-
-    PSLLW ( CONST(8), MM1 )         /* q1 << 8 */
-
-#if GMBT_ROUNDOFF
-    MOVQ ( MM2, MM4 )
-#endif
+#endif
 
-    PMULLW ( MM3, MM2 )             /* t1 = (q1 - p1)*pa1 */
-
-#if GMBT_ROUNDOFF
-    PSRLW ( CONST(15), MM4 )        /* q1 > p1 ? 1 : 0 */
-
-    PSLLW ( CONST(8), MM4 )         /* q1 > p1 ? 0x100 : 0 */
-
-    PSUBW ( MM4, MM2 )              /* t1 -=? 0x100 */
-#endif
-
-#if GMBT_ROUNDOFF
-    MOVQ ( CONTENT(const_80), MM4 )
-
-    PADDW ( MM4, MM2 )              /* t1 += 0x80 */
-#endif
-
-#if GMBT_GEOMETRIC_SERIES
-    MOVQ ( MM2, MM3 )
-
-    PSRLW ( CONST(8), MM3 )         /* t1 >> 8 */
+/* common blending loading code
+ *
+ * note that M00 is a register with the 0x0000000000000000 constant, which
+ * can easily be obtained by doing
+ *
+ *   PXOR ( M00, M00 )
+ */
+#define GMB_LOAD( rgba, dest, MP1, MQ1, MA1, MP2, MQ2, MA2, M00 ) \
+ONE(MOVD ( REGIND(rgba), MP1 ))     /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
+ONE(MOVD ( REGIND(dest), MQ1 ))     /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
+ ;\
+TWO(MOVQ ( REGIND(rgba), MP1 ))     /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\
+TWO(MOVQ ( REGIND(dest), MQ1 ))     /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
+ ;\
+TWO(MOVQ ( MP1, MP2 )) ;\
+TWO(MOVQ ( MQ1, MQ2 )) ;\
+ ;\
+    PUNPCKLBW ( M00, MQ1 )          /* qa1 | qb1 | qg1 | qr1 */ ;\
+TWO(PUNPCKHBW ( M00, MQ2 ))         /* qa2 | qb2 | qg2 | qr2 */ ;\
+    PUNPCKLBW ( M00, MP1 )          /* pa1 | pb1 | pg1 | pr1 */ ;\
+TWO(PUNPCKHBW ( M00, MP2 ))         /* pa2 | pb2 | pg2 | pr2 */ ;\
+ ;\
+    MOVQ ( MP1, MA1 ) ;\
+TWO(MOVQ ( MP2, MA2 )) ;\
+ ;\
+    PUNPCKHWD ( MA1, MA1 )          /* pa1 | pa1 | | */ ;\
+TWO(PUNPCKHWD ( MA2, MA2 ))         /* pa2 | pa2 | | */ ;\
+    PUNPCKHDQ ( MA1, MA1 )          /* pa1 | pa1 | pa1 | pa1 */ ;\
+TWO(PUNPCKHDQ ( MA2, MA2 ))         /* pa2 | pa2 | pa2 | pa2 */
+
+
+/* common blending storing code
+ */
+#define GMB_STORE( rgba, MA1, MA2 ) \
+    PACKUSWB ( MA2, MA1 )           /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
+ ;\
+ONE(MOVD ( MA1, REGIND(rgba) )) ;\
+TWO(MOVQ ( MA1, REGIND(rgba) ))
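The ONE()/TWO() wrappers used by GMB_LOAD and GMB_STORE are what makes the
template work: the same MAIN body is expanded with ONE active for the runin
and runout (one pixel) and TWO active for the main loop (two pixels). A
minimal C sketch of the same preprocessor technique, with a made-up printf
body rather than the MMX instructions:

    #include <stdio.h>

    /* the loop body is written once; ONE()/TWO() select the single- or
     * two-element variant of each statement, like MAIN in mmx_blendtmp.h */
    #define MAIN \
    ONE(printf("one element: %d\n", i));  \
    TWO(printf("two elements: %d %d\n", i, i + 1))

    static void run(int n)
    {
        int i = 0;

        /* runin: a single leading element, as the alignment code does */
    #define ONE(x) x
    #define TWO(x)
        if (n & 1) { MAIN; i++; }
    #undef ONE
    #undef TWO

        /* main loop: the same body now expands its two-element statements */
    #define ONE(x)
    #define TWO(x) x
        for (; i < n; i += 2) { MAIN; }
    #undef ONE
    #undef TWO
    }

    int main(void)
    {
        run(5);
        return 0;
    }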
-    PADDW ( MM3, MM2 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
-
-#if GMBT_GEOMETRIC_CORRECTION
-    PSRLW ( CONST(7), MM3 )         /* t1 >> 15 */
+    SEG_TEXT
 
-    PADDW ( MM3, MM2 )              /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
-#endif
-#endif
 
-    PADDW ( MM1, MM2 )              /* (t1/255 + q1) << 8 */
+/* common transparency blending mode
+ */
 
-    PSRLW ( CONST(8), MM2 )         /* sa1 | sb1 | sg1 | sr1 */
-
-    PACKUSWB ( MM0, MM2 )           /* | | | | sa1 | sb1 | sg1 | sr1 */
-    MOVD ( MM2, REGIND(EDI) )
+#define TAG(x) x##_transparency
 
-LLBL (GMBT_done):
+#define INIT \
+    GMB_INIT( MM0 )
 
-    EMMS
+#define MAIN \
+    GMB_LOAD( EDI, ESI, MM1, MM2, MM3, MM4, MM5, MM6, MM0 ) ;\
+    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) ;\
+    GMB_STORE( EDI, MM3, MM6 )
 
-LLBL (GMBT_return):
+#include "mmx_blendtmp.h"
 
-    POP_L ( EBX )
-    POP_L ( EDI )
-    POP_L ( ESI )
-    MOV_L ( EBP, ESP )
-    POP_L ( EBP )
-    RET
diff --git a/src/mesa/x86/mmx_blendtmp.h b/src/mesa/x86/mmx_blendtmp.h
new file mode 100644
index 00000000000..395436ba015
--- /dev/null
+++ b/src/mesa/x86/mmx_blendtmp.h
@@ -0,0 +1,113 @@
+/*
+ * Written by José Fonseca
+ */
+
+
+/*
+ * void _mesa_mmx_blend( GLcontext *ctx,
+ *                       GLuint n,
+ *                       const GLubyte mask[],
+ *                       GLchan rgba[][4],
+ *                       CONST GLchan dest[][4] )
+ */
+ALIGNTEXT16
+GLOBL GLNAME( TAG(_mesa_mmx_blend) )
+
+GLNAME( TAG(_mesa_mmx_blend) ):
+
+    PUSH_L ( EBP )
+    MOV_L ( ESP, EBP )
+    PUSH_L ( ESI )
+    PUSH_L ( EDI )
+    PUSH_L ( EBX )
+
+    MOV_L ( REGOFF(12, EBP), ECX )  /* n */
+    CMP_L ( CONST(0), ECX )
+    JE ( LLBL ( TAG(GMB_return) ) )
+
+    MOV_L ( REGOFF(16, EBP), EBX )  /* mask */
+    MOV_L ( REGOFF(20, EBP), EDI )  /* rgba */
+    MOV_L ( REGOFF(24, EBP), ESI )  /* dest */
+
+    INIT
+
+    TEST_L ( CONST(4), EDI )        /* align rgba on an 8-byte boundary */
+    JZ ( LLBL ( TAG(GMB_align_end) ) )
+
+    CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
+    JE ( LLBL ( TAG(GMB_align_continue) ) )
+
+    /* runin */
+#define ONE(x) x
+#define TWO(x)
+    MAIN
+#undef ONE
+#undef TWO
+
+LLBL ( TAG(GMB_align_continue) ):
+
+    DEC_L ( ECX )                   /* n -= 1 */
+    INC_L ( EBX )                   /* mask += 1 */
+    ADD_L ( CONST(4), EDI )         /* rgba += 1 */
+    ADD_L ( CONST(4), ESI )         /* dest += 1 */
+
+LLBL ( TAG(GMB_align_end) ):
+
+    CMP_L ( CONST(2), ECX )
+    JB ( LLBL ( TAG(GMB_loop_end) ) )
+
+ALIGNTEXT16
+LLBL ( TAG(GMB_loop_begin) ):
+
+    CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
+    JE ( LLBL ( TAG(GMB_loop_continue) ) )
+
+    /* main loop */
+#define ONE(x)
+#define TWO(x) x
+    MAIN
+#undef ONE
+#undef TWO
+
+LLBL ( TAG(GMB_loop_continue) ):
+
+    DEC_L ( ECX )
+    DEC_L ( ECX )                   /* n -= 2 */
+    ADD_L ( CONST(2), EBX )         /* mask += 2 */
+    ADD_L ( CONST(8), EDI )         /* rgba += 2 */
+    ADD_L ( CONST(8), ESI )         /* dest += 2 */
+    CMP_L ( CONST(2), ECX )
+    JAE ( LLBL ( TAG(GMB_loop_begin) ) )
+
+LLBL ( TAG(GMB_loop_end) ):
+
+    CMP_L ( CONST(1), ECX )
+    JB ( LLBL ( TAG(GMB_done) ) )
+
+    CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
+    JE ( LLBL ( TAG(GMB_done) ) )
+
+    /* runout */
+#define ONE(x) x
+#define TWO(x)
+    MAIN
+#undef ONE
+#undef TWO
+
+LLBL ( TAG(GMB_done) ):
+
+    EMMS
+
+LLBL ( TAG(GMB_return) ):
+
+    POP_L ( EBX )
+    POP_L ( EDI )
+    POP_L ( ESI )
+    MOV_L ( EBP, ESP )
+    POP_L ( EBP )
+    RET
+
+#undef TAG
+#undef INIT
+#undef MAIN
-- 
2.30.2
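For completeness, here is a scalar C sketch of what the generated
_mesa_mmx_blend_transparency computes per pixel: every pixel with a nonzero
mask byte is lerped toward rgba by its own alpha, with the division that the
MMX version approximates via the GSC series. The names and the uint8_t
interface are illustrative, not Mesa's actual GLchan types:

    #include <stdint.h>

    /* reference semantics: rgba[i] = dest[i] + (rgba[i] - dest[i])*a/255 */
    static void blend_transparency_ref(uint32_t n, const uint8_t mask[],
                                       uint8_t rgba[][4],
                                       const uint8_t dest[][4])
    {
        uint32_t i;
        int c;

        for (i = 0; i < n; i++) {
            int a;

            if (!mask[i])
                continue;           /* matches the CMP_B/JE mask skips */
            a = rgba[i][3];         /* alpha is broadcast to all channels */
            for (c = 0; c < 4; c++) {
                int p = rgba[i][c], q = dest[i][c];
                rgba[i][c] = (uint8_t)(q + (p - q) * a / 255);
            }
        }
    }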